From 20ea823d6d2198d8fe4e187f32e925fe429cb29b Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 17 Aug 2021 16:08:47 -0400
Subject: [PATCH 01/46] cudf resolve nvcc 11.0 compiler crashes during codegen
 (#9028)

When compiling with nvcc 11.0, any translation unit that uses arrow::AllocateBuffer or arrow::AllocateBitmap causes a
compiler crash during codegen.

To avoid this issue we introduce a C++ shim between cudf/interop and arrow that does the arrow allocations for us.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)
  - Conor Hoekstra (https://github.com/codereport)

URL: https://github.com/rapidsai/cudf/pull/9028
---
 cpp/CMakeLists.txt                         |  1 +
 cpp/src/interop/detail/arrow_allocator.cpp | 51 ++++++++++++++++++++++
 cpp/src/interop/detail/arrow_allocator.hpp | 31 +++++++++++++
 cpp/src/interop/to_arrow.cu                | 43 ++++++------------
 4 files changed, 97 insertions(+), 29 deletions(-)
 create mode 100644 cpp/src/interop/detail/arrow_allocator.cpp
 create mode 100644 cpp/src/interop/detail/arrow_allocator.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 54a4c4ea023..bb17f13db53 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -255,6 +255,7 @@ add_library(cudf
     src/interop/dlpack.cpp
     src/interop/from_arrow.cu
     src/interop/to_arrow.cu
+    src/interop/detail/arrow_allocator.cpp
     src/io/avro/avro.cpp
     src/io/avro/avro_gpu.cu
     src/io/avro/reader_impl.cu
diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp
new file mode 100644
index 00000000000..cb67c893573
--- /dev/null
+++ b/cpp/src/interop/detail/arrow_allocator.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/interop.hpp>
+
+namespace cudf {
+namespace detail {
+
+std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr)
+{
+  /*
+  nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer
+  and `ValueOrDie` are used inside a CUDA compilation unit.
+
+  To work around this issue we compile an allocation shim in C++ and use
+  that from our cuda sources
+  */
+  auto result = arrow::AllocateBuffer(size, ar_mr);
+  CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer");
+  return std::move(result).ValueOrDie();
+}
+
+std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr)
+{
+  /*
+  nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBitmap
+  and `ValueOrDie` are used inside a CUDA compilation unit.
+
+  To work around this issue we compile an allocation shim in C++ and use
+  that from our cuda sources
+  */
+  auto result = arrow::AllocateBitmap(size, ar_mr);
+  CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap");
+  return std::move(result).ValueOrDie();
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/interop/detail/arrow_allocator.hpp b/cpp/src/interop/detail/arrow_allocator.hpp
new file mode 100644
index 00000000000..20099f91afa
--- /dev/null
+++ b/cpp/src/interop/detail/arrow_allocator.hpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/interop.hpp>
+
+namespace cudf {
+namespace detail {
+
+// unique_ptr because that is what AllocateBuffer returns
+std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr);
+
+// shared_ptr because that is what AllocateBitmap returns
+std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr);
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 3cd515e9981..3271804bf39 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -34,6 +34,8 @@
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include "detail/arrow_allocator.hpp"
+
 namespace cudf {
 namespace detail {
 namespace {
@@ -48,10 +50,7 @@ std::shared_ptr<arrow::Buffer> fetch_data_buffer(column_view input_view,
 {
   const int64_t data_size_in_bytes = sizeof(T) * input_view.size();
 
-  auto result = arrow::AllocateBuffer(data_size_in_bytes, ar_mr);
-  CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data");
-
-  std::shared_ptr<arrow::Buffer> data_buffer = std::move(result.ValueOrDie());
+  auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr);
 
   CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
                            input_view.data<T>(),
@@ -59,7 +58,7 @@ std::shared_ptr<arrow::Buffer> fetch_data_buffer(column_view input_view,
                            cudaMemcpyDeviceToHost,
                            stream.value()));
 
-  return data_buffer;
+  return std::move(data_buffer);
 }
 
 /**
@@ -72,9 +71,7 @@ std::shared_ptr<arrow::Buffer> fetch_mask_buffer(column_view input_view,
   const int64_t mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size());
 
   if (input_view.has_nulls()) {
-    auto result = arrow::AllocateBitmap(static_cast<int64_t>(input_view.size()), ar_mr);
-    CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for mask");
-    std::shared_ptr<arrow::Buffer> mask_buffer = std::move(result.ValueOrDie());
+    auto mask_buffer = allocate_arrow_bitmap(static_cast<int64_t>(input_view.size()), ar_mr);
     CUDA_TRY(cudaMemcpyAsync(
       mask_buffer->mutable_data(),
       (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(),
@@ -163,10 +160,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
                    });
 
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
-  auto result                  = arrow::AllocateBuffer(buf_size_in_bytes, ar_mr);
-  CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data");
-
-  std::shared_ptr<arrow::Buffer> data_buffer = std::move(result.ValueOrDie());
+  auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
   CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
                            buf.data(),
@@ -176,7 +170,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
 
   auto type    = arrow::decimal(18, -input.type().scale());
   auto mask    = fetch_mask_buffer(input, ar_mr, stream);
-  auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, data_buffer};
+  auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, std::move(data_buffer)};
   auto data    = std::make_shared<arrow::ArrayData>(type, input.size(), buffers);
 
   return std::make_shared<arrow::Decimal128Array>(data);
@@ -191,10 +185,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<bool>(column_view in
 {
   auto bitmask = bools_to_mask(input, stream);
 
-  auto result = arrow::AllocateBuffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
-  CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data");
-
-  std::shared_ptr<arrow::Buffer> data_buffer = std::move(result.ValueOrDie());
+  auto data_buffer = allocate_arrow_buffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
 
   CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
                            bitmask.first->data(),
@@ -203,7 +194,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<bool>(column_view in
                            stream.value()));
   return to_arrow_array(id,
                         static_cast<int64_t>(input.size()),
-                        data_buffer,
+                        std::move(data_buffer),
                         fetch_mask_buffer(input, ar_mr, stream),
                         static_cast<int64_t>(input.null_count()));
 }
@@ -225,19 +216,13 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
   column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input;
   auto child_arrays      = fetch_child_array(input_view, {{}, {}}, ar_mr, stream);
   if (child_arrays.empty()) {
-    arrow::Result<std::unique_ptr<arrow::Buffer>> result;
-
     // Empty string will have only one value in offset of 4 bytes
-    result = arrow::AllocateBuffer(4, ar_mr);
-    CUDF_EXPECTS(result.ok(), "Failed to allocate buffer");
-    std::shared_ptr<arrow::Buffer> tmp_offset_buffer = std::move(result.ValueOrDie());
-    tmp_offset_buffer->mutable_data()[0]             = 0;
-
-    result = arrow::AllocateBuffer(0, ar_mr);
-    CUDF_EXPECTS(result.ok(), "Failed to allocate buffer");
-    std::shared_ptr<arrow::Buffer> tmp_data_buffer = std::move(result.ValueOrDie());
+    auto tmp_offset_buffer               = allocate_arrow_buffer(4, ar_mr);
+    auto tmp_data_buffer                 = allocate_arrow_buffer(0, ar_mr);
+    tmp_offset_buffer->mutable_data()[0] = 0;
 
-    return std::make_shared<arrow::StringArray>(0, tmp_offset_buffer, tmp_data_buffer);
+    return std::make_shared<arrow::StringArray>(
+      0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer));
   }
   auto offset_buffer = child_arrays[0]->data()->buffers[1];
   auto data_buffer   = child_arrays[1]->data()->buffers[1];

From 0410bf9d654ed8f82ea74a2a3b337614c2795b4d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 17 Aug 2021 16:48:35 -0400
Subject: [PATCH 02/46] Remove some debug print statements from gtests (#9048)

While working on #8883 I found some print statements that were a bit annoying while trying to narrow down the cuda-memcheck errors. This PR removes them. I don't think any of these are hard to recode if needed for a specific debug session, but I feel they should not be inflicted on developers who are debugging other issues.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/9048
---
 cpp/tests/encode/encode_tests.cpp          |  3 ---
 cpp/tests/rolling/grouped_rolling_test.cpp | 26 ----------------------
 cpp/tests/rolling/rolling_test.cpp         | 13 -----------
 cpp/tests/structs/structs_column_tests.cpp |  5 -----
 4 files changed, 47 deletions(-)

diff --git a/cpp/tests/encode/encode_tests.cpp b/cpp/tests/encode/encode_tests.cpp
index 52244b38dfe..73c77a39a97 100644
--- a/cpp/tests/encode/encode_tests.cpp
+++ b/cpp/tests/encode/encode_tests.cpp
@@ -67,9 +67,6 @@ TYPED_TEST(EncodeNumericTests, SimpleWithNulls)
   cudf::test::fixed_width_column_wrapper<TypeParam> expect_keys{{1, 2, 3, 0}, {1, 1, 1, 0}};
   auto const result = cudf::encode(cudf::table_view({input}));
 
-  cudf::test::print(result.first->view().column(0));
-  cudf::test::print(expect_keys);
-
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.first->view().column(0), expect_keys);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.second->view(), expect);
 }
diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp
index cb123114fd8..72b30c19fd5 100644
--- a/cpp/tests/rolling/grouped_rolling_test.cpp
+++ b/cpp/tests/rolling/grouped_rolling_test.cpp
@@ -139,19 +139,6 @@ class GroupedRollingTest : public cudf::test::BaseFixture {
     auto reference = create_reference_output(
       op, input, expected_grouping, preceding_window, following_window, min_periods);
 
-#ifndef NDEBUG
-    std::cout << "input:\n";
-    cudf::test::print(input, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "output:\n";
-    cudf::test::print(*output, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "reference:\n";
-    cudf::test::print(*reference, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "\n";
-#endif
-
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference);
   }
 
@@ -709,19 +696,6 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture {
                                              following_window_in_days,
                                              min_periods);
 
-#ifndef NDEBUG
-    std::cout << "input:\n";
-    cudf::test::print(input, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "output:\n";
-    cudf::test::print(*output, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "reference:\n";
-    cudf::test::print(*reference, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "\n";
-#endif
-
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference);
   }
 
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index a67e670acb7..ec88500fde1 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -190,19 +190,6 @@ class RollingTest : public cudf::test::BaseFixture {
     auto reference =
       create_reference_output(op, input, preceding_window, following_window, min_periods);
 
-#if 0
-    std::cout << "input:\n";
-    cudf::test::print(input, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "output:\n";
-    cudf::test::print(*output, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "reference:\n";
-    cudf::test::print(reference, std::cout, ", ");
-    std::cout << "\n";
-    std::cout << "\n";
-#endif
-
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference);
   }
 
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp
index 548284d6c87..a94a35e8896 100644
--- a/cpp/tests/structs/structs_column_tests.cpp
+++ b/cpp/tests/structs/structs_column_tests.cpp
@@ -433,11 +433,6 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestListsOfStructs)
 
   cudf::test::expect_columns_equivalent(expected_unchanged_struct_col,
                                         cudf::lists_column_view(*list_col).child());
-
-#ifndef NDEBUG
-  std::cout << "Printing list col: \n";
-  cudf::test::print(*list_col);
-#endif
 }
 
 TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList)

From 5f1b7f2eee17b43ee40223564247c024435f90ad Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 17 Aug 2021 16:18:52 -0500
Subject: [PATCH 03/46] Add support for percentile dispatch in `dask_cudf`
 (#9031)

This PR adds support for percentile dispatch in `dask_cudf`.

Upstream changes needed: https://github.com/dask/dask/pull/8029

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Marlene  (https://github.com/marlenezw)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: https://github.com/rapidsai/cudf/pull/9031
---
 python/cudf/cudf/core/column/datetime.py  |  9 +++++
 python/cudf/cudf/core/column/timedelta.py |  9 +++++
 python/cudf/cudf/tests/test_datetime.py   |  9 +++++
 python/cudf/cudf/tests/test_timedelta.py  |  9 +++++
 python/dask_cudf/dask_cudf/backends.py    | 48 +++++++++++++++++++++++
 5 files changed, 84 insertions(+)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index fa59f38c734..46ff1990ac2 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -158,6 +158,15 @@ def to_pandas(
             index=index,
         )
 
+    @property
+    def values(self):
+        """
+        Return a CuPy representation of the DateTimeColumn.
+        """
+        raise NotImplementedError(
+            "DateTime Arrays is not yet implemented in cudf"
+        )
+
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(self, field)
 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 295663e70f3..7c1250231f3 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -92,6 +92,15 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool:
             return False
         return item.view("int64") in self.as_numerical
 
+    @property
+    def values(self):
+        """
+        Return a CuPy representation of the TimeDeltaColumn.
+        """
+        raise NotImplementedError(
+            "TimeDelta Arrays is not yet implemented in cudf"
+        )
+
     def to_arrow(self) -> pa.Array:
         mask = None
         if self.nullable:
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 96d50c66f7e..9f19bf8b960 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1541,3 +1541,12 @@ def test_is_quarter_end(data, dtype):
     got = gs.dt.is_quarter_end
 
     assert_eq(expect, got)
+
+
+def test_error_values():
+    s = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
+    with pytest.raises(
+        NotImplementedError,
+        match="DateTime Arrays is not yet implemented in cudf",
+    ):
+        s.values
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index a65fdeeb0dd..75923a0b284 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -1386,3 +1386,12 @@ def test_timedelta_reductions(data, op, dtype):
         assert True
     else:
         assert_eq(expected.to_numpy(), actual)
+
+
+def test_error_values():
+    s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
+    with pytest.raises(
+        NotImplementedError,
+        match="TimeDelta Arrays is not yet implemented in cudf",
+    ):
+        s.values
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index be50b5c3794..c0204190957 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
+from collections.abc import Iterator
+
 import cupy as cp
 import numpy as np
 import pandas as pd
@@ -256,6 +258,52 @@ def is_categorical_dtype_cudf(obj):
     return cudf.utils.dtypes.is_categorical_dtype(obj)
 
 
+try:
+    from dask.dataframe.dispatch import percentile_dispatch
+
+    @percentile_dispatch.register((cudf.Series, cp.ndarray, cudf.Index))
+    def percentile_cudf(a, q, interpolation="linear"):
+        # Cudf dispatch to the equivalent of `np.percentile`:
+        # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html
+        a = cudf.Series(a)
+        # a is series.
+        n = len(a)
+        if not len(a):
+            return None, n
+        if isinstance(q, Iterator):
+            q = list(q)
+
+        if cudf.utils.dtypes.is_categorical_dtype(a.dtype):
+            result = cp.percentile(a.cat.codes, q, interpolation=interpolation)
+
+            return (
+                pd.Categorical.from_codes(
+                    result, a.dtype.categories, a.dtype.ordered
+                ),
+                n,
+            )
+        if np.issubdtype(a.dtype, np.datetime64):
+            result = a.quantile(
+                [i / 100.0 for i in q], interpolation=interpolation
+            )
+
+            if q[0] == 0:
+                # https://github.com/dask/dask/issues/6864
+                result[0] = min(result[0], a.min())
+            return result.to_pandas(), n
+        if not np.issubdtype(a.dtype, np.number):
+            interpolation = "nearest"
+        return (
+            a.quantile(
+                [i / 100.0 for i in q], interpolation=interpolation
+            ).to_pandas(),
+            n,
+        )
+
+
+except ImportError:
+    pass
+
 try:
     from dask.dataframe.dispatch import union_categoricals_dispatch
 

From b3c1caf26410782f4e2b412efbd2f566ac11194a Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Tue, 17 Aug 2021 16:55:38 -0600
Subject: [PATCH 04/46] Implement `interleave_columns` for structs columns
 (#9012)

This PR adds support for structs columns in the `interleave_columns` API. In addition, it also does a simple refactor of the existing overload functions with a new style of SFINAE implementation.
Closes #8927.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: https://github.com/rapidsai/cudf/pull/9012
---
 cpp/src/reshape/interleave_columns.cu         | 133 +++++--
 .../reshape/interleave_columns_tests.cpp      | 341 +++++++++++++++++-
 2 files changed, 445 insertions(+), 29 deletions(-)

diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 869218f9643..b15708c5cf8 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -29,32 +29,111 @@
 namespace cudf {
 namespace detail {
 namespace {
-struct interleave_columns_functor {
-  template <typename T, typename... Args>
-  std::enable_if_t<not cudf::is_fixed_width<T>() and not std::is_same_v<T, cudf::string_view> and
-                     not std::is_same_v<T, cudf::list_view>,
-                   std::unique_ptr<cudf::column>>
-  operator()(Args&&...)
+// Error case when no other overload or specialization is available
+template <typename T, typename Enable = void>
+struct interleave_columns_impl {
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
   {
-    CUDF_FAIL("Called `interleave_columns` on none-supported data type.");
+    CUDF_FAIL("Unsupported type in `interleave_columns`.");
   }
+};
 
+struct interleave_columns_functor {
   template <typename T>
-  std::enable_if_t<std::is_same_v<T, cudf::list_view>, std::unique_ptr<cudf::column>> operator()(
-    table_view const& lists_columns,
-    bool create_mask,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+  std::unique_ptr<cudf::column> operator()(table_view const& input,
+                                           bool create_mask,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+  {
+    return interleave_columns_impl<T>{}(input, create_mask, stream, mr);
+  }
+};
+
+template <typename T>
+struct interleave_columns_impl<T, typename std::enable_if_t<std::is_same_v<T, cudf::list_view>>> {
+  std::unique_ptr<column> operator()(table_view const& lists_columns,
+                                     bool create_mask,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
   {
     return lists::detail::interleave_columns(lists_columns, create_mask, stream, mr);
   }
+};
 
-  template <typename T>
-  std::enable_if_t<std::is_same_v<T, cudf::string_view>, std::unique_ptr<cudf::column>> operator()(
-    table_view const& strings_columns,
-    bool create_mask,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+template <typename T>
+struct interleave_columns_impl<T, typename std::enable_if_t<std::is_same_v<T, cudf::struct_view>>> {
+  std::unique_ptr<cudf::column> operator()(table_view const& structs_columns,
+                                           bool create_mask,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+  {
+    // We can safely call `column(0)` as the number of columns is known to be non zero.
+    auto const num_children = structs_columns.column(0).num_children();
+    CUDF_EXPECTS(
+      std::all_of(structs_columns.begin(),
+                  structs_columns.end(),
+                  [num_children](auto const& col) { return col.num_children() == num_children; }),
+      "Number of children of the input structs columns must be the same");
+
+    auto const num_columns = structs_columns.num_columns();
+    auto const num_rows    = structs_columns.num_rows();
+    auto const output_size = num_columns * num_rows;
+
+    // Interleave the children of the structs columns.
+    std::vector<std::unique_ptr<cudf::column>> output_struct_members;
+    for (size_type child_idx = 0; child_idx < num_children; ++child_idx) {
+      // Collect children columns from the input structs columns at index `child_idx`.
+      auto const child_iter =
+        thrust::make_transform_iterator(structs_columns.begin(), [child_idx](auto const& col) {
+          return structs_column_view(col).get_sliced_child(child_idx);
+        });
+      auto children = std::vector<column_view>(child_iter, child_iter + num_columns);
+
+      auto const child_type = children.front().type();
+      CUDF_EXPECTS(
+        std::all_of(children.cbegin(),
+                    children.cend(),
+                    [child_type](auto const& col) { return child_type == col.type(); }),
+        "Children of the input structs columns at the same child index must have the same type");
+
+      auto const children_nullable = std::any_of(
+        children.cbegin(), children.cend(), [](auto const& col) { return col.nullable(); });
+      output_struct_members.emplace_back(
+        type_dispatcher<dispatch_storage_type>(child_type,
+                                               interleave_columns_functor{},
+                                               table_view{std::move(children)},
+                                               children_nullable,
+                                               stream,
+                                               mr));
+    }
+
+    auto const create_mask_fn = [&] {
+      auto const input_dv_ptr = table_device_view::create(structs_columns);
+      auto const validity_fn  = [input_dv = *input_dv_ptr, num_columns] __device__(auto const idx) {
+        return input_dv.column(idx % num_columns).is_valid(idx / num_columns);
+      };
+      return cudf::detail::valid_if(thrust::make_counting_iterator<size_type>(0),
+                                    thrust::make_counting_iterator<size_type>(output_size),
+                                    validity_fn,
+                                    stream,
+                                    mr);
+    };
+
+    // Only create null mask if at least one input structs column is nullable.
+    auto [null_mask, null_count] =
+      create_mask ? create_mask_fn() : std::pair{rmm::device_buffer{0, stream, mr}, size_type{0}};
+    return make_structs_column(
+      output_size, std::move(output_struct_members), null_count, std::move(null_mask), stream, mr);
+  }
+};
+
+template <typename T>
+struct interleave_columns_impl<T, typename std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
+  std::unique_ptr<cudf::column> operator()(table_view const& strings_columns,
+                                           bool create_mask,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
   {
     auto num_columns = strings_columns.num_columns();
     if (num_columns == 1)  // Single strings column returns a copy
@@ -106,7 +185,7 @@ struct interleave_columns_functor {
       cudf::detail::get_value<int32_t>(offsets_column->view(), num_strings, stream);
     auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr);
     // Fill the chars column
-    auto d_results_chars = chars_column->mutable_view().data<char>();
+    auto d_results_chars = chars_column->mutable_view().template data<char>();
     thrust::for_each_n(
       rmm::exec_policy(stream),
       thrust::make_counting_iterator<size_type>(0),
@@ -132,13 +211,14 @@ struct interleave_columns_functor {
                                stream,
                                mr);
   }
+};
 
-  template <typename T>
-  std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<cudf::column>> operator()(
-    table_view const& input,
-    bool create_mask,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+template <typename T>
+struct interleave_columns_impl<T, typename std::enable_if_t<cudf::is_fixed_width<T>()>> {
+  std::unique_ptr<cudf::column> operator()(table_view const& input,
+                                           bool create_mask,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
   {
     auto arch_column = input.column(0);
     auto output_size = input.num_columns() * input.num_rows();
@@ -193,11 +273,10 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
   CUDF_EXPECTS(input.num_columns() > 0, "input must have at least one column to determine dtype.");
 
   auto const dtype = input.column(0).type();
-
   CUDF_EXPECTS(std::all_of(std::cbegin(input),
                            std::cend(input),
                            [dtype](auto const& col) { return dtype == col.type(); }),
-               "DTYPE mismatch");
+               "Input columns must have the same type");
 
   auto const output_needs_mask = std::any_of(
     std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); });
diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp
index 386fd9d08ee..e51f0740787 100644
--- a/cpp/tests/reshape/interleave_columns_tests.cpp
+++ b/cpp/tests/reshape/interleave_columns_tests.cpp
@@ -24,7 +24,7 @@
 
 using namespace cudf::test::iterators;
 
-constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
+constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR};
 
 template <typename T>
 struct InterleaveColumnsTest : public cudf::test::BaseFixture {
@@ -378,7 +378,7 @@ using IntListsCol = cudf::test::lists_column_wrapper<int32_t>;
 using IntCol      = cudf::test::fixed_width_column_wrapper<int32_t>;
 using TView       = cudf::table_view;
 
-constexpr int32_t null{0};
+constexpr int32_t null{0};  // mark for null elements
 }  // namespace
 
 struct ListsColumnsInterleaveTest : public cudf::test::BaseFixture {
@@ -731,4 +731,341 @@ TEST_F(ListsColumnsInterleaveTest, SlicedStringsColumnsInputWithNulls)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity);
 }
 
+namespace {
+using StructsCol = cudf::test::structs_column_wrapper;
+using StringsCol = cudf::test::strings_column_wrapper;
+}  // namespace
+
+struct StructsColumnsInterleaveTest : public cudf::test::BaseFixture {
+};
+
+TEST_F(StructsColumnsInterleaveTest, InvalidInput)
+{
+  // Mixing a non-struct column with a struct column is expected to throw cudf::logic_error
+  {
+    auto const col1 = IntCol{};
+    auto const col2 = StructsCol{};
+    EXPECT_THROW(cudf::interleave_columns(TView{{col1, col2}}), cudf::logic_error);
+  }
+
+  // Struct columns whose second children differ in type (IntCol vs. StringsCol) must be rejected
+  {
+    auto const structs1 = [] {
+      auto child1 = IntCol{1, 2, 3};
+      auto child2 = IntCol{4, 5, 6};
+      return StructsCol{{child1, child2}};
+    }();
+
+    auto const structs2 = [] {
+      auto child1 = IntCol{7, 8, 9};
+      auto child2 = StringsCol{"", "abc", "123"};
+      return StructsCol{{child1, child2}};
+    }();
+
+    EXPECT_THROW(cudf::interleave_columns(TView{{structs1, structs2}}), cudf::logic_error);
+  }
+
+  // Struct columns with different numbers of children (2 vs. 3) must be rejected
+  {
+    auto const structs1 = [] {
+      auto child1 = IntCol{1, 2, 3};
+      auto child2 = IntCol{4, 5, 6};
+      return StructsCol{{child1, child2}};
+    }();
+
+    auto const structs2 = [] {
+      auto child1 = IntCol{7, 8, 9};
+      auto child2 = IntCol{10, 11, 12};
+      auto child3 = IntCol{13, 14, 15};
+      return StructsCol{{child1, child2, child3}};
+    }();
+
+    EXPECT_THROW(cudf::interleave_columns(TView{{structs1, structs2}}), cudf::logic_error);
+  }
+}
+
+TEST_F(StructsColumnsInterleaveTest, InterleaveEmptyColumns)
+{
+  auto const empty_structs = StructsCol{};  // interleaving two empty inputs should stay empty
+  auto const interleaved   = cudf::interleave_columns(TView{{empty_structs, empty_structs}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(empty_structs, *interleaved, verbosity);
+}
+
+template <typename T>
+struct StructsColumnsInterleaveTypedTest : public cudf::test::BaseFixture {
+};
+
+using TypesForTest = cudf::test::Concat<cudf::test::IntegralTypesNotBool,
+                                        cudf::test::FloatingPointTypes,
+                                        cudf::test::FixedPointTypes>;
+TYPED_TEST_SUITE(StructsColumnsInterleaveTypedTest, TypesForTest);
+
+TYPED_TEST(StructsColumnsInterleaveTypedTest, InterleaveOneColumnNotNull)
+{
+  using NumCol = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+  // Interleaving a single struct column is expected to yield an equivalent column.
+  auto const input = [] {
+    auto first  = NumCol{1, 2, 3};
+    auto second = NumCol{4, 5, 6};
+    auto fruits = StringsCol{"Banana", "Mango", "Apple"};
+    return StructsCol{{first, second, fruits}};
+  }();
+  auto const interleaved = cudf::interleave_columns(TView{{input}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input, *interleaved, verbosity);
+}
+
+TYPED_TEST(StructsColumnsInterleaveTypedTest, InterleaveOneColumnWithNulls)
+{
+  using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  auto const structs = [] {
+    auto child1 = ColWrapper{{1, 2, null, 3}, null_at(2)};
+    auto child2 = ColWrapper{{4, null, 5, 6}, null_at(1)};
+    auto child3 = StringsCol{{"" /*NULL*/, "Banana", "Mango", "Apple"}, null_at(0)};
+    return StructsCol{{child1, child2, child3}, null_at(3)};
+  }();
+  auto const results = cudf::interleave_columns(TView{{structs}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(structs, *results, verbosity);
+}
+
+TYPED_TEST(StructsColumnsInterleaveTypedTest, SimpleInputNoNull)
+{
+  using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  auto const structs1 = [] {
+    auto child1 = ColWrapper{1, 2, 3};
+    auto child2 = ColWrapper{4, 5, 6};
+    auto child3 = StringsCol{"Banana", "Mango", "Apple"};
+    return StructsCol{{child1, child2, child3}};
+  }();
+
+  auto const structs2 = [] {
+    auto child1 = ColWrapper{7, 8, 9};
+    auto child2 = ColWrapper{10, 11, 12};
+    auto child3 = StringsCol{"Bear", "Duck", "Cat"};
+    return StructsCol{{child1, child2, child3}};
+  }();
+
+  auto const expected = [] {
+    auto child1 = ColWrapper{1, 7, 2, 8, 3, 9};
+    auto child2 = ColWrapper{4, 10, 5, 11, 6, 12};
+    auto child3 = StringsCol{"Banana", "Bear", "Mango", "Duck", "Apple", "Cat"};
+    return StructsCol{{child1, child2, child3}};
+  }();
+
+  auto const results = cudf::interleave_columns(TView{{structs1, structs2}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity);
+}
+
+TYPED_TEST(StructsColumnsInterleaveTypedTest, SimpleInputWithNulls)
+{
+  using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  auto const structs1 = [] {
+    auto child1 = ColWrapper{{1, 2, null, 3, 4}, null_at(2)};
+    auto child2 = ColWrapper{{4, null, 5, 6, 7}, null_at(1)};
+    auto child3 = StringsCol{{"" /*NULL*/, "Banana", "Mango", "Apple", "Cherry"}, null_at(0)};
+    return StructsCol{{child1, child2, child3}, null_at(0)};
+  }();
+
+  auto const structs2 = [] {
+    auto child1 = ColWrapper{{7, null, null, 8, 9}, nulls_at({1, 2})};
+    auto child2 = ColWrapper{{10, 11, 12, null, 14}, null_at(3)};
+    auto child3 = StringsCol{"Bear", "Duck", "Cat", "Dog", "Panda"};
+    return StructsCol{{child1, child2, child3}, null_at(4)};
+  }();
+
+  auto const structs3 = [] {
+    auto child1 = ColWrapper{{-1, -2, -3, 0, null}, null_at(4)};
+    auto child2 = ColWrapper{{-5, 0, null, -1, -10}, null_at(2)};
+    auto child3 = StringsCol{"111", "Bànànà", "abcxyz", "é á í", "zzz"};
+    return StructsCol{{child1, child2, child3}, null_at(1)};
+  }();
+
+  auto const expected = [] {
+    auto child1 = ColWrapper{{1, 7, -1, 2, null, -2, null, null, -3, 3, 8, 0, 4, 9, null},
+                             nulls_at({4, 6, 7, 14})};
+    auto child2 = ColWrapper{{4, 10, -5, null, 11, 0, 5, 12, null, 6, null, -1, 7, 14, -10},
+                             nulls_at({3, 8, 10})};
+    auto child3 = StringsCol{{"" /*NULL*/,
+                              "Bear",
+                              "111",
+                              "Banana",
+                              "Duck",
+                              "Bànànà",
+                              "Mango",
+                              "Cat",
+                              "abcxyz",
+                              "Apple",
+                              "Dog",
+                              "é á í",
+                              "Cherry",
+                              "Panda",
+                              "zzz"},
+                             null_at(0)};
+    return StructsCol{{child1, child2, child3}, nulls_at({0, 5, 13})};
+  }();
+
+  auto const results = cudf::interleave_columns(TView{{structs1, structs2, structs3}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity);
+}
+
+TYPED_TEST(StructsColumnsInterleaveTypedTest, NestedInputStructsColumns)
+{
+  using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  auto const structs1 = [] {
+    auto child_structs1 = [] {
+      auto child1 = ColWrapper{{null, 2, 3, 4, 5}, null_at(0)};
+      auto child2 = ColWrapper{{6, 7, 8, null, 10}, null_at(3)};
+      return StructsCol{{child1, child2}, null_at(0)};
+    }();
+
+    auto child_structs2 = [] {
+      auto child1 = ColWrapper{{11, null, 13, 14, 15}, null_at(1)};
+      auto child2 = ColWrapper{{null, 17, 18, 19, 20}, null_at(0)};
+      return StructsCol{{child1, child2}, nulls_at({0, 1})};
+    }();
+
+    auto child_strings = [] { return StringsCol{"Banana", "Mango", "Apple", "Cherry", "Kiwi"}; }();
+
+    return StructsCol{{child_structs1, child_structs2, child_strings}, null_at(0)};
+  }();
+
+  auto const structs2 = [] {
+    auto child_structs1 = [] {
+      auto child1 = ColWrapper{{-1, null, -3, -4, -5}, null_at(1)};
+      auto child2 = ColWrapper{{-6, -7, -8, null, -10}, null_at(3)};
+      return StructsCol{{child1, child2}};
+    }();
+
+    auto child_structs2 = [] {
+      auto child1 = ColWrapper{{-11, -12, null, -14, -15}, null_at(2)};
+      auto child2 = ColWrapper{{-16, -17, -18, -19, null}, null_at(4)};
+      return StructsCol{{child1, child2}, null_at(2)};
+    }();
+
+    auto child_strings = [] { return StringsCol{"Bear", "Duck", "Cat", "Dog", "Rabbit"}; }();
+
+    return StructsCol{{child_structs1, child_structs2, child_strings}, null_at(2)};
+  }();
+
+  auto const expected = [] {
+    auto child_structs1 = [] {
+      auto child1 = ColWrapper{{null, -1, 2, null, 3, -3, 4, -4, 5, -5}, nulls_at({0, 3})};
+      auto child2 = ColWrapper{{6, -6, 7, -7, 8, -8, null, null, 10, -10}, nulls_at({6, 7})};
+      return StructsCol{{child1, child2}, null_at(0)};
+    }();
+
+    auto child_structs2 = [] {
+      auto child1 = ColWrapper{{11, -11, null, -12, 13, null, 14, -14, 15, -15}, nulls_at({2, 5})};
+      auto child2 = ColWrapper{{null, -16, 17, -17, 18, -18, 19, -19, 20, null}, nulls_at({0, 9})};
+      return StructsCol{{child1, child2}, nulls_at({0, 2, 5})};
+    }();
+
+    auto child_strings = [] {
+      return StringsCol{
+        "Banana", "Bear", "Mango", "Duck", "Apple", "Cat", "Cherry", "Dog", "Kiwi", "Rabbit"};
+    }();
+
+    return StructsCol{{child_structs1, child_structs2, child_strings}, nulls_at({0, 5})};
+  }();
+
+  auto const results = cudf::interleave_columns(TView{{structs1, structs2}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity);
+}
+
+TYPED_TEST(StructsColumnsInterleaveTypedTest, SlicedColumnsInputNoNull)
+{
+  using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+  constexpr int32_t NOT_USE{-1};  // placeholder for elements outside the slices (ignored)
+
+  auto const structs1_original = [] {
+    auto child1 = ColWrapper{NOT_USE, NOT_USE, 1, 2, 3, NOT_USE};
+    auto child2 = ColWrapper{NOT_USE, NOT_USE, 4, 5, 6, NOT_USE};
+    auto child3 = StringsCol{"NOT_USE", "NOT_USE", "Banana", "Mango", "Apple", "NOT_USE"};
+    return StructsCol{{child1, child2, child3}};
+  }();
+
+  // structs2 has more rows than structs1 before slicing (7 vs. 6); both slices have 3 rows
+  auto const structs2_original = [] {
+    auto child1 = ColWrapper{NOT_USE, 7, 8, 9, NOT_USE, NOT_USE, NOT_USE};
+    auto child2 = ColWrapper{NOT_USE, 10, 11, 12, NOT_USE, NOT_USE, NOT_USE};
+    auto child3 = StringsCol{"NOT_USE", "Bear", "Duck", "Cat", "NOT_USE", "NOT_USE", "NOT_USE"};
+    return StructsCol{{child1, child2, child3}};
+  }();
+
+  auto const expected = [] {
+    auto child1 = ColWrapper{1, 7, 2, 8, 3, 9};
+    auto child2 = ColWrapper{4, 10, 5, 11, 6, 12};
+    auto child3 = StringsCol{"Banana", "Bear", "Mango", "Duck", "Apple", "Cat"};
+    return StructsCol{{child1, child2, child3}};
+  }();
+
+  auto const structs1 = cudf::slice(structs1_original, {2, 5})[0];
+  auto const structs2 = cudf::slice(structs2_original, {1, 4})[0];
+  auto const results  = cudf::interleave_columns(TView{{structs1, structs2}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity);
+}
+
+TYPED_TEST(StructsColumnsInterleaveTypedTest, SlicedColumnsInputWithNulls)
+{
+  using ColWrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+  constexpr int32_t NOT_USE{-1};  // placeholder for elements outside the slices (ignored)
+
+  auto const structs1_original = [] {
+    auto child1 = ColWrapper{{NOT_USE, NOT_USE, 1, 2, null, 3, 4, NOT_USE}, null_at(4)};
+    auto child2 = ColWrapper{{NOT_USE, NOT_USE, 4, null, 5, 6, 7, NOT_USE}, null_at(3)};
+    auto child3 = StringsCol{
+      {"NOT_USE", "NOT_USE", "" /*NULL*/, "Banana", "Mango", "Apple", "Cherry", "NOT_USE"},
+      null_at(2)};
+    return StructsCol{{child1, child2, child3}, null_at(2)};
+  }();
+
+  auto const structs2_original = [] {
+    auto child1 = ColWrapper{{7, null, null, 8, 9, NOT_USE, NOT_USE}, nulls_at({1, 2})};
+    auto child2 = ColWrapper{{10, 11, 12, null, 14, NOT_USE, NOT_USE}, null_at(3)};
+    auto child3 = StringsCol{"Bear", "Duck", "Cat", "Dog", "Panda", "NOT_USE", "NOT_USE"};
+    return StructsCol{{child1, child2, child3}, null_at(4)};
+  }();
+
+  auto const structs3_original = [] {
+    auto child1 = ColWrapper{{NOT_USE, NOT_USE, NOT_USE, -1, -2, -3, 0, null}, null_at(7)};
+    auto child2 = ColWrapper{{NOT_USE, NOT_USE, NOT_USE, -5, 0, null, -1, -10}, null_at(5)};
+    auto child3 =
+      StringsCol{"NOT_USE", "NOT_USE", "NOT_USE", "111", "Bànànà", "abcxyz", "é á í", "zzz"};
+    return StructsCol{{child1, child2, child3}, null_at(4)};
+  }();
+
+  auto const expected = [] {
+    auto child1 = ColWrapper{{1, 7, -1, 2, null, -2, null, null, -3, 3, 8, 0, 4, 9, null},
+                             nulls_at({4, 6, 7, 14})};
+    auto child2 = ColWrapper{{4, 10, -5, null, 11, 0, 5, 12, null, 6, null, -1, 7, 14, -10},
+                             nulls_at({3, 8, 10})};
+    auto child3 = StringsCol{{"" /*NULL*/,
+                              "Bear",
+                              "111",
+                              "Banana",
+                              "Duck",
+                              "Bànànà",
+                              "Mango",
+                              "Cat",
+                              "abcxyz",
+                              "Apple",
+                              "Dog",
+                              "é á í",
+                              "Cherry",
+                              "Panda",
+                              "zzz"},
+                             null_at(0)};
+    return StructsCol{{child1, child2, child3}, nulls_at({0, 5, 13})};
+  }();
+
+  auto const structs1 = cudf::slice(structs1_original, {2, 7})[0];
+  auto const structs2 = cudf::slice(structs2_original, {0, 5})[0];
+  auto const structs3 = cudf::slice(structs3_original, {3, 8})[0];
+  auto const results  = cudf::interleave_columns(TView{{structs1, structs2, structs3}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From c76ec131dd5eb2517de90a23aaa328f5e9c7b064 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 17 Aug 2021 18:14:34 -0500
Subject: [PATCH 05/46] Fix `Dataframe` indexer setitem when array is passed
 (#9006)

Fixes: #8672

This PR handles `ndarray` inputs (NumPy and CuPy) in `_DataFrameLocIndexer.__setitem__`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Marlene  (https://github.com/marlenezw)

URL: https://github.com/rapidsai/cudf/pull/9006
---
 python/cudf/cudf/core/indexing.py        | 25 ++++++++++++++--
 python/cudf/cudf/tests/test_dataframe.py | 37 ++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index 09cfc6e144a..da999f13fa8 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -432,7 +432,7 @@ def _setitem_tuple_arg(self, key, value):
             )
 
         try:
-            columns = self._get_column_selection(key[1])
+            columns_df = self._get_column_selection(key[1])
         except KeyError:
             if not self._df.empty and isinstance(key[0], slice):
                 pos_range = get_label_range_or_mask(
@@ -457,8 +457,27 @@ def _setitem_tuple_arg(self, key, value):
                 )
             self._df._data.insert(key[1], new_col)
         else:
-            for col in columns:
-                self._df[col].loc[key[0]] = value
+            if isinstance(value, (cp.ndarray, np.ndarray)):
+                value_df = cudf.DataFrame(value)
+                if value_df.shape[1] != columns_df.shape[1]:
+                    if value_df.shape[1] == 1:
+                        value_cols = (
+                            value_df._data.columns * columns_df.shape[1]
+                        )
+                    else:
+                        raise ValueError(
+                            f"shape mismatch: value array of shape "
+                            f"{value_df.shape} could not be "
+                            f"broadcast to indexing result of shape "
+                            f"{columns_df.shape}"
+                        )
+                else:
+                    value_cols = value_df._data.columns
+                for i, col in enumerate(columns_df._column_names):
+                    self._df[col].loc[key[0]] = value_cols[i]
+            else:
+                for col in columns_df._column_names:
+                    self._df[col].loc[key[0]] = value
 
     def _get_column_selection(self, arg):
         return self._df._get_columns_by_label(arg)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index a37a80236c1..a337660b5b0 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8759,6 +8759,43 @@ def test_frame_series_where():
     assert_eq(expected, actual)
 
 
+@pytest.mark.parametrize(
+    "array,is_error",
+    [
+        (cupy.arange(20, 40).reshape(-1, 2), False),
+        (cupy.arange(20, 50).reshape(-1, 3), True),
+        (np.arange(20, 40).reshape(-1, 2), False),
+        (np.arange(20, 30).reshape(-1, 1), False),
+        (cupy.arange(20, 30).reshape(-1, 1), False),
+    ],
+)
+def test_dataframe_indexing_setitem_np_cp_array(array, is_error):
+    gdf = cudf.DataFrame({"a": range(10), "b": range(10)})
+    pdf = gdf.to_pandas()
+    if not is_error:
+        gdf.loc[:, ["a", "b"]] = array
+        pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array)
+
+        assert_eq(gdf, pdf)
+    else:
+        assert_exceptions_equal(
+            lfunc=pdf.loc.__setitem__,
+            rfunc=gdf.loc.__setitem__,
+            lfunc_args_and_kwargs=(
+                [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)],
+                {},
+            ),
+            rfunc_args_and_kwargs=(
+                [(slice(None, None, None), ["a", "b"]), array],
+                {},
+            ),
+            compare_error_message=False,
+            expected_error_message="shape mismatch: value array of shape "
+            "(10, 3) could not be broadcast to indexing "
+            "result of shape (10, 2)",
+        )
+
+
 @pytest.mark.parametrize(
     "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}],
 )

From 0789a81afda0b4ab57180dacb98e9591391567f7 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 17 Aug 2021 18:40:19 -0500
Subject: [PATCH 06/46] Remove usage of string based `set_dtypes` for `csv` &
 `json` readers (#9049)

This PR removes usage of `string` based `set_dtypes` in both `CSV` & `JSON` readers.

Partially addresses: https://github.com/rapidsai/cudf/issues/8240

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/9049
---
 python/cudf/cudf/_lib/cpp/io/csv.pxd  |   1 -
 python/cudf/cudf/_lib/cpp/io/json.pxd |   1 -
 python/cudf/cudf/_lib/csv.pyx         | 118 +++++++++++++++++---------
 python/cudf/cudf/_lib/json.pyx        |  46 ++++++----
 python/cudf/cudf/tests/test_csv.py    |   1 -
 python/cudf/cudf/tests/test_json.py   |   4 +-
 6 files changed, 107 insertions(+), 64 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd
index 725757121d9..4afd8732320 100644
--- a/python/cudf/cudf/_lib/cpp/io/csv.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd
@@ -101,7 +101,6 @@ cdef extern from "cudf/io/csv.hpp" \
         void set_parse_hex(vector[int]) except+
 
         # Conversion settings
-        void set_dtypes(vector[string] types) except+
         void set_dtypes(vector[data_type] types) except+
         void set_dtypes(map[string, data_type] types) except+
         void set_true_values(vector[string] vals) except+
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index 4a3792f5023..2c65e329bb0 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -26,7 +26,6 @@ cdef extern from "cudf/io/json.hpp" \
         bool is_enabled_dayfirst() except+
 
         # setter
-        void set_dtypes(vector[string] types) except+
         void set_dtypes(vector[data_type] types) except+
         void set_dtypes(map[string, data_type] types) except+
         void set_compression(
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index a15a180d466..812d614e6d3 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -1,11 +1,16 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 from libcpp cimport bool
+from libcpp.map cimport map
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+cimport cudf._lib.cpp.types as libcudf_types
+from cudf._lib.cpp.types cimport data_type, type_id
+from cudf._lib.types cimport dtype_to_data_type
+
 import numpy as np
 import pandas as pd
 
@@ -69,6 +74,12 @@ class Compression(IntEnum):
     )
 
 
+CSV_HEX_TYPE_MAP = {
+    "hex": np.dtype("int64"),
+    "hex64": np.dtype("int64"),
+    "hex32": np.dtype("int32")
+}
+
 cdef csv_reader_options make_csv_reader_options(
     object datasource,
     object lineterminator,
@@ -119,7 +130,10 @@ cdef csv_reader_options make_csv_reader_options(
     cdef quote_style c_quoting
     cdef vector[string] c_parse_dates_names
     cdef vector[int] c_parse_dates_indexes
-    cdef vector[string] c_dtypes
+    cdef vector[string] c_hex_col_names
+    cdef vector[data_type] c_dtypes_list
+    cdef map[string, data_type] c_dtypes_map
+    cdef vector[int] c_hex_col_indexes
     cdef vector[string] c_true_values
     cdef vector[string] c_false_values
     cdef vector[string] c_na_values
@@ -232,37 +246,50 @@ cdef csv_reader_options make_csv_reader_options(
 
     if dtype is not None:
         if isinstance(dtype, abc.Mapping):
-            c_dtypes.reserve(len(dtype))
             for k, v in dtype.items():
-                c_dtypes.push_back(
-                    str(
-                        str(k)+":"+
-                        _get_cudf_compatible_str_from_dtype(v)
-                    ).encode()
-                )
+                col_type = v
+                if v in CSV_HEX_TYPE_MAP:
+                    col_type = CSV_HEX_TYPE_MAP[v]
+                    c_hex_col_names.push_back(str(k).encode())
+
+                c_dtypes_map[str(k).encode()] = \
+                    _get_cudf_data_type_from_dtype(
+                        cudf.dtype(col_type))
+            csv_reader_options_c.set_dtypes(c_dtypes_map)
+            csv_reader_options_c.set_parse_hex(c_hex_col_names)
         elif (
             cudf.utils.dtypes.is_scalar(dtype) or
             isinstance(dtype, (
                 np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type
             ))
         ):
-            c_dtypes.reserve(1)
-            c_dtypes.push_back(
-                _get_cudf_compatible_str_from_dtype(dtype).encode()
+            c_dtypes_list.reserve(1)
+            if dtype in CSV_HEX_TYPE_MAP:
+                dtype = CSV_HEX_TYPE_MAP[dtype]
+                c_hex_col_indexes.push_back(0)
+
+            c_dtypes_list.push_back(
+                _get_cudf_data_type_from_dtype(dtype)
             )
+            csv_reader_options_c.set_dtypes(c_dtypes_list)
+            csv_reader_options_c.set_parse_hex(c_hex_col_indexes)
         elif isinstance(dtype, abc.Iterable):
-            c_dtypes.reserve(len(dtype))
-            for col_dtype in dtype:
-                c_dtypes.push_back(
-                    _get_cudf_compatible_str_from_dtype(col_dtype).encode()
+            c_dtypes_list.reserve(len(dtype))
+            for index, col_dtype in enumerate(dtype):
+                if col_dtype in CSV_HEX_TYPE_MAP:
+                    col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
+                    c_hex_col_indexes.push_back(index)
+
+                c_dtypes_list.push_back(
+                    _get_cudf_data_type_from_dtype(col_dtype)
                 )
+            csv_reader_options_c.set_dtypes(c_dtypes_list)
+            csv_reader_options_c.set_parse_hex(c_hex_col_indexes)
         else:
             raise ValueError(
                 "dtype should be a scalar/str/list-like/dict-like"
             )
 
-        csv_reader_options_c.set_dtypes(c_dtypes)
-
     if true_values is not None:
         c_true_values.reserve(len(true_values))
         for tv in true_values:
@@ -484,7 +511,7 @@ cpdef write_csv(
         cpp_write_csv(options)
 
 
-def _get_cudf_compatible_str_from_dtype(dtype):
+cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
     # TODO: Remove this Error message once the
     # following issue is fixed:
     # https://github.com/rapidsai/cudf/issues/3960
@@ -494,29 +521,38 @@ def _get_cudf_compatible_str_from_dtype(dtype):
             "supported in CSV reader"
         )
 
-    if (
-        str(dtype) in cudf.utils.dtypes.ALL_TYPES or
-        str(dtype) in {
-            "hex", "hex32", "hex64", "date", "date32", "timestamp",
-            "timestamp[us]", "timestamp[s]", "timestamp[ms]", "timestamp[ns]",
-            "date64"
-        }
-    ):
-        return str(dtype)
-    pd_dtype = pd.core.dtypes.common.pandas_dtype(dtype)
-
-    if pd_dtype in cudf.utils.dtypes.pandas_dtypes_to_cudf_dtypes:
-        return str(cudf.utils.dtypes.pandas_dtypes_to_cudf_dtypes[pd_dtype])
-    elif isinstance(pd_dtype, np.dtype) and pd_dtype.kind in ("O", "U"):
-        return "str"
-    elif (
-        pd_dtype in cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes or
-        str(pd_dtype) in cudf.utils.dtypes.ALL_TYPES or
-        cudf.utils.dtypes.is_categorical_dtype(pd_dtype)
-    ):
-        return str(pd_dtype)
-    else:
-        raise ValueError(f"dtype not understood: {dtype}")
+    if isinstance(dtype, str):
+        if str(dtype) == "date32":
+            return libcudf_types.data_type(
+                libcudf_types.type_id.TIMESTAMP_DAYS
+            )
+        elif str(dtype) in ("date", "date64"):
+            return libcudf_types.data_type(
+                libcudf_types.type_id.TIMESTAMP_MILLISECONDS
+            )
+        elif str(dtype) == "timestamp":
+            return libcudf_types.data_type(
+                libcudf_types.type_id.TIMESTAMP_MILLISECONDS
+            )
+        elif str(dtype) == "timestamp[us]":
+            return libcudf_types.data_type(
+                libcudf_types.type_id.TIMESTAMP_MICROSECONDS
+            )
+        elif str(dtype) == "timestamp[s]":
+            return libcudf_types.data_type(
+                libcudf_types.type_id.TIMESTAMP_SECONDS
+            )
+        elif str(dtype) == "timestamp[ms]":
+            return libcudf_types.data_type(
+                libcudf_types.type_id.TIMESTAMP_MILLISECONDS
+            )
+        elif str(dtype) == "timestamp[ns]":
+            return libcudf_types.data_type(
+                libcudf_types.type_id.TIMESTAMP_NANOSECONDS
+            )
+
+    dtype = cudf.dtype(dtype)
+    return dtype_to_data_type(dtype)
 
 
 def columns_apply_na_rep(column_names, na_rep):
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index db196528e97..68d9da57e83 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -10,18 +10,21 @@ import os
 import cudf
 
 from libcpp cimport bool
+from libcpp.map cimport map
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
+cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.cpp.io.json cimport (
     json_reader_options,
     read_json as libcudf_read_json,
 )
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.cpp.types cimport data_type, size_type, type_id
 from cudf._lib.io.utils cimport make_source_info
 from cudf._lib.table cimport Table
+from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport data_from_unique_ptr
 
 
@@ -51,7 +54,8 @@ cpdef read_json(object filepaths_or_buffers,
             filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode()
 
     # Setup arguments
-    cdef vector[string] c_dtypes
+    cdef vector[data_type] c_dtypes_list
+    cdef map[string, data_type] c_dtypes_map
     cdef cudf_io_types.compression_type c_compression
     # Determine byte read offsets if applicable
     cdef size_type c_range_offset = (
@@ -71,40 +75,36 @@ cpdef read_json(object filepaths_or_buffers,
             c_compression = cudf_io_types.compression_type.AUTO
     else:
         c_compression = cudf_io_types.compression_type.NONE
-
+    is_list_like_dtypes = False
     if dtype is False:
         raise ValueError("False value is unsupported for `dtype`")
     elif dtype is not True:
         if isinstance(dtype, abc.Mapping):
-            c_dtypes.reserve(len(dtype))
             for k, v in dtype.items():
-                if cudf.utils.dtypes.is_categorical_dtype(v):
-                    raise NotImplementedError(
-                        "CategoricalDtype as dtype is not yet "
-                        "supported in JSON reader"
-                    )
-                c_dtypes.push_back(str(str(k) + ":" + str(v)).encode())
+                c_dtypes_map[str(k).encode()] = \
+                    _get_cudf_data_type_from_dtype(v)
         elif not isinstance(dtype, abc.Iterable):
             raise TypeError("`dtype` must be 'list like' or 'dict'")
         else:
-            c_dtypes.reserve(len(dtype))
+            is_list_like_dtypes = True
+            c_dtypes_list.reserve(len(dtype))
             for col_dtype in dtype:
-                if cudf.utils.dtypes.is_categorical_dtype(col_dtype):
-                    raise NotImplementedError(
-                        "CategoricalDtype as dtype is not yet "
-                        "supported in JSON reader"
-                    )
-                c_dtypes.push_back(str(col_dtype).encode())
+                c_dtypes_list.push_back(
+                    _get_cudf_data_type_from_dtype(
+                        col_dtype))
 
     cdef json_reader_options opts = move(
         json_reader_options.builder(make_source_info(filepaths_or_buffers))
-        .dtypes(c_dtypes)
         .compression(c_compression)
         .lines(c_lines)
         .byte_range_offset(c_range_offset)
         .byte_range_size(c_range_size)
         .build()
     )
+    if is_list_like_dtypes:
+        opts.set_dtypes(c_dtypes_list)
+    else:
+        opts.set_dtypes(c_dtypes_map)
 
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_out_table
@@ -115,3 +115,13 @@ cpdef read_json(object filepaths_or_buffers,
     column_names = [x.decode() for x in c_out_table.metadata.column_names]
     return data_from_unique_ptr(move(c_out_table.tbl),
                                 column_names=column_names)
+
+cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
+    if cudf.utils.dtypes.is_categorical_dtype(dtype):
+        raise NotImplementedError(
+            "CategoricalDtype as dtype is not yet "
+            "supported in JSON reader"
+        )
+
+    dtype = cudf.dtype(dtype)
+    return dtype_to_data_type(dtype)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 5511a65d0a4..f04a5e6dca0 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -320,7 +320,6 @@ def test_csv_reader_dtype_dict(use_names):
     dtypes = df.dtypes.to_dict()
     gdf_names = list(gdf_dtypes.keys()) if use_names else None
     pdf_names = list(pdf_dtypes.keys()) if use_names else None
-
     gdf = read_csv(StringIO(buffer), dtype=dtypes, names=gdf_names)
     pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=pdf_names)
 
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 0b138f446ae..8c06dbea03f 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -278,7 +278,7 @@ def test_json_lines_byte_range(json_input):
 )
 def test_json_lines_dtypes(json_input, dtype):
     df = cudf.read_json(json_input, lines=True, dtype=dtype)
-    assert all(df.dtypes == ["float32", "int32", "int16"])
+    assert all(df.dtypes == ["float64", "int64", "int16"])
 
 
 @pytest.mark.parametrize(
@@ -301,7 +301,7 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
     pd_df.to_json(fname, compression=out_comp, lines=True, orient="records")
 
     cu_df = cudf.read_json(
-        str(fname), compression=in_comp, lines=True, dtype=["int", "int"]
+        str(fname), compression=in_comp, lines=True, dtype=["int32", "int32"]
     )
     assert_eq(pd_df, cu_df)
 

From dfe0a03d40f956561e9416b6cc00246d99537aa5 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <devavret@users.noreply.github.com>
Date: Wed, 18 Aug 2021 23:20:48 +0530
Subject: [PATCH 07/46] Fix nvcc warnings in ORC writer (#8975)

Contributes to #8916

Authors:
  - Devavret Makkar (https://github.com/devavret)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/8975
---
 .../cudf/column/column_device_view.cuh        |  9 +++
 cpp/src/io/orc/orc_gpu.h                      |  2 +-
 cpp/src/io/orc/writer_impl.cu                 | 77 +++++++++++++------
 3 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 87fbc1ac651..5950edabbfc 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -24,6 +24,7 @@
 #include <cudf/structs/struct_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
+#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -853,6 +854,14 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
     return d_children[child_index];
   }
 
+  /**
+   * @brief Returns a span containing the children of this column
+   */
+  __device__ device_span<column_device_view const> children() const noexcept
+  {
+    return device_span<column_device_view const>(d_children, _num_children);
+  }
+
   /**
    * @brief Returns the number of child columns
    *
diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h
index c866afd5324..004812615eb 100644
--- a/cpp/src/io/orc/orc_gpu.h
+++ b/cpp/src/io/orc/orc_gpu.h
@@ -139,7 +139,7 @@ struct EncChunk {
   int32_t scale;                     // scale for decimals or timestamps
 
   uint32_t* dict_index;  // dictionary index from row index
-  device_span<uint32_t> decimal_offsets;
+  uint32_t* decimal_offsets;
   column_device_view const* leaf_column;
 };
 
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index d93845530d7..e0018ed7166 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -678,9 +678,7 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table,
           ck.dtype_len = column.type_width();
         }
         ck.scale = column.scale();
-        if (ck.type_kind == TypeKind::DECIMAL) {
-          ck.decimal_offsets = device_span<uint32_t>{column.decimal_offsets(), ck.num_rows};
-        }
+        if (ck.type_kind == TypeKind::DECIMAL) { ck.decimal_offsets = column.decimal_offsets(); }
       }
     }
   }
@@ -1140,26 +1138,28 @@ void writer::impl::init_state()
   out_sink_->host_write(MAGIC, std::strlen(MAGIC));
 }
 
-/**
- * @brief pre-order append ORC device columns
- */
-void __device__ append_orc_device_column(uint32_t& idx,
-                                         thrust::optional<uint32_t> parent_idx,
-                                         device_span<orc_column_device_view> cols,
-                                         column_device_view col)
-{
-  auto const current_idx = idx;
-  cols[current_idx]      = orc_column_device_view{col, parent_idx};
-  idx++;
-  if (col.type().id() == type_id::LIST) {
-    append_orc_device_column(
-      idx, current_idx, cols, col.child(lists_column_view::child_column_index));
+template <typename T>
+struct device_stack {
+  __device__ device_stack(T* stack_storage, int capacity)
+    : stack(stack_storage), capacity(capacity), size(0)
+  {
   }
-  if (col.type().id() == type_id::STRUCT) {
-    for (auto child_idx = 0; child_idx < col.num_child_columns(); ++child_idx) {
-      append_orc_device_column(idx, current_idx, cols, col.child(child_idx));
-    }
+  __device__ void push(T const& val)
+  {
+    cudf_assert(size < capacity and "Stack overflow");
+    stack[size++] = val;
   }
+  __device__ T pop()
+  {
+    cudf_assert(size > 0 and "Stack underflow");
+    return stack[--size];
+  }
+  __device__ bool empty() { return size == 0; }
+
+ private:
+  T* stack;
+  int capacity;
+  int size;
 };
 
 orc_table_view make_orc_table_view(table_view const& table,
@@ -1189,13 +1189,40 @@ orc_table_view make_orc_table_view(table_view const& table,
   }
 
   rmm::device_uvector<orc_column_device_view> d_orc_columns(orc_columns.size(), stream);
+  using stack_value_type = thrust::pair<column_device_view const*, thrust::optional<uint32_t>>;
+  rmm::device_uvector<stack_value_type> stack_storage(orc_columns.size(), stream);
 
+  // pre-order append ORC device columns
   cudf::detail::device_single_thread(
-    [d_orc_cols = device_span<orc_column_device_view>{d_orc_columns},
-     d_table    = d_table] __device__() mutable {
+    [d_orc_cols         = device_span<orc_column_device_view>{d_orc_columns},
+     d_table            = d_table,
+     stack_storage      = stack_storage.data(),
+     stack_storage_size = stack_storage.size()] __device__() {
+      device_stack stack(stack_storage, stack_storage_size);
+
+      thrust::for_each(thrust::seq,
+                       thrust::make_reverse_iterator(d_table.end()),
+                       thrust::make_reverse_iterator(d_table.begin()),
+                       [&stack](column_device_view const& c) {
+                         stack.push({&c, thrust::nullopt});
+                       });
+
       uint32_t idx = 0;
-      for (auto const& column : d_table) {
-        append_orc_device_column(idx, thrust::nullopt, d_orc_cols, column);
+      while (not stack.empty()) {
+        auto [col, parent] = stack.pop();
+        d_orc_cols[idx]    = orc_column_device_view{*col, parent};
+
+        if (col->type().id() == type_id::LIST) {
+          stack.push({&col->children()[lists_column_view::child_column_index], idx});
+        } else if (col->type().id() == type_id::STRUCT) {
+          thrust::for_each(thrust::seq,
+                           thrust::make_reverse_iterator(col->children().end()),
+                           thrust::make_reverse_iterator(col->children().begin()),
+                           [&stack, idx](column_device_view const& c) {
+                             stack.push({&c, idx});
+                           });
+        }
+        idx++;
       }
     },
     stream);

From 04b7027b954e3154fd0451a4ae5b5776a95e6b6f Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 18 Aug 2021 12:46:10 -0700
Subject: [PATCH 08/46] Expose expression base class publicly and simplify
 public AST API (#9045)

This PR performs the renames `ast::expression`->`ast::operation` and `ast::node`->`ast::expression`, which more accurately reflects that literals and column references are also valid expressions. The new `expression` class is publicly exposed, allowing client code to maintain a list of expression pointers composed of different subclasses of expression, and all APIs now accept arbitrary expressions as input so that it is possible to evaluate e.g. a raw column reference on a table. The old `nodes.hpp` and `operators.hpp` files have been merged into a single `expressions.hpp` header that now contains the entire public API for AST construction, but some implementation details have been moved into an `expressions.cpp` source file so dependencies on detail APIs like the `expression_parser` are restricted to a single TU, which dramatically simplifies recompilation of the code on small changes. Many more parameters are now accepted by (const) ref rather than by value, avoiding unnecessary copies and improving const correctness of the code.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Nghia Truong (https://github.com/ttnghia)
  - Jason Lowe (https://github.com/jlowe)
  - Dillon Cullinan (https://github.com/dillon-cullinan)

URL: https://github.com/rapidsai/cudf/pull/9045
---
 conda/recipes/libcudf/meta.yaml               |   3 +-
 cpp/CMakeLists.txt                            |   1 +
 cpp/benchmarks/ast/transform_benchmark.cpp    |  10 +-
 .../join/conditional_join_benchmark.cu        |  15 +-
 cpp/benchmarks/join/join_benchmark_common.hpp |   3 +-
 .../cudf/ast/detail/expression_evaluator.cuh  |   3 +-
 .../cudf/ast/detail/expression_parser.hpp     |  47 ++++---
 cpp/include/cudf/ast/detail/operators.hpp     |   2 +-
 .../cudf/ast/{nodes.hpp => expressions.hpp}   | 122 +++++++++++------
 cpp/include/cudf/ast/operators.hpp            |  76 -----------
 cpp/include/cudf/detail/transform.hpp         |   4 +-
 cpp/include/cudf/join.hpp                     |  56 ++++----
 cpp/include/cudf/transform.hpp                |   4 +-
 cpp/src/ast/expression_parser.cpp             |  96 +++++++------
 cpp/src/ast/expressions.cpp                   |  58 ++++++++
 cpp/src/join/conditional_join.cu              |  60 ++++----
 cpp/src/join/conditional_join.hpp             |   6 +-
 cpp/src/transform/compute_column.cu           |   6 +-
 cpp/tests/ast/transform_tests.cpp             | 107 +++++++++------
 cpp/tests/join/conditional_join_tests.cu      |  59 ++++----
 .../ast/{AstNode.java => AstExpression.java}  |  15 +-
 ...ryExpression.java => BinaryOperation.java} |  14 +-
 .../ai/rapids/cudf/ast/BinaryOperator.java    |   2 +-
 .../ai/rapids/cudf/ast/ColumnReference.java   |   6 +-
 .../java/ai/rapids/cudf/ast/Expression.java   |  31 -----
 .../main/java/ai/rapids/cudf/ast/Literal.java |  10 +-
 ...aryExpression.java => UnaryOperation.java} |  12 +-
 .../ai/rapids/cudf/ast/UnaryOperator.java     |   2 +-
 .../main/native/src/CompiledExpression.cpp    |  67 ++++-----
 .../src/main/native/src/jni_compiled_expr.hpp |  18 +--
 .../test/java/ai/rapids/cudf/TableTest.java   |  38 +++---
 .../cudf/ast/CompiledExpressionTest.java      | 128 ++++++++----------
 32 files changed, 536 insertions(+), 545 deletions(-)
 rename cpp/include/cudf/ast/{nodes.hpp => expressions.hpp} (59%)
 delete mode 100644 cpp/include/cudf/ast/operators.hpp
 create mode 100644 cpp/src/ast/expressions.cpp
 rename java/src/main/java/ai/rapids/cudf/ast/{AstNode.java => AstExpression.java} (82%)
 rename java/src/main/java/ai/rapids/cudf/ast/{BinaryExpression.java => BinaryOperation.java} (72%)
 delete mode 100644 java/src/main/java/ai/rapids/cudf/ast/Expression.java
 rename java/src/main/java/ai/rapids/cudf/ast/{UnaryExpression.java => UnaryOperation.java} (73%)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index c1ba2b495eb..208c21c2dc0 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -53,8 +53,7 @@ test:
     - test -f $PREFIX/include/cudf/aggregation.hpp
     - test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp
     - test -f $PREFIX/include/cudf/ast/detail/operators.hpp
-    - test -f $PREFIX/include/cudf/ast/nodes.hpp
-    - test -f $PREFIX/include/cudf/ast/operators.hpp
+    - test -f $PREFIX/include/cudf/ast/expressions.hpp
     - test -f $PREFIX/include/cudf/binaryop.hpp
     - test -f $PREFIX/include/cudf/labeling/label_bins.hpp
     - test -f $PREFIX/include/cudf/column/column_factories.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index bb17f13db53..81fad82b3ea 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -153,6 +153,7 @@ add_library(cudf
     src/aggregation/aggregation.cu
     src/aggregation/result_cache.cpp
     src/ast/expression_parser.cpp
+    src/ast/expressions.cpp
     src/binaryop/binaryop.cpp
     src/binaryop/compiled/binary_ops.cu
     src/binaryop/compiled/Add.cu
diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp b/cpp/benchmarks/ast/transform_benchmark.cpp
index 75b502bf7bf..fd0a0f7d2c8 100644
--- a/cpp/benchmarks/ast/transform_benchmark.cpp
+++ b/cpp/benchmarks/ast/transform_benchmark.cpp
@@ -95,22 +95,22 @@ static void BM_ast_transform(benchmark::State& state)
   // Note that a std::list is required here because of its guarantees against reference invalidation
   // when items are added or removed. References to items in a std::vector are not safe if the
   // vector must re-allocate.
-  auto expressions = std::list<cudf::ast::expression>();
+  auto expressions = std::list<cudf::ast::operation>();
 
   // Construct tree that chains additions like (((a + b) + c) + d)
   auto const op = cudf::ast::ast_operator::ADD;
   if (reuse_columns) {
-    expressions.push_back(cudf::ast::expression(op, column_refs.at(0), column_refs.at(0)));
+    expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(0)));
     for (cudf::size_type i = 0; i < tree_levels - 1; i++) {
-      expressions.push_back(cudf::ast::expression(op, expressions.back(), column_refs.at(0)));
+      expressions.push_back(cudf::ast::operation(op, expressions.back(), column_refs.at(0)));
     }
   } else {
-    expressions.push_back(cudf::ast::expression(op, column_refs.at(0), column_refs.at(1)));
+    expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(1)));
     std::transform(std::next(column_refs.cbegin(), 2),
                    column_refs.cend(),
                    std::back_inserter(expressions),
                    [&](auto const& column_ref) {
-                     return cudf::ast::expression(op, expressions.back(), column_ref);
+                     return cudf::ast::operation(op, expressions.back(), column_ref);
                    });
   }
 
diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join_benchmark.cu
index f778f6ac010..71b90685fb9 100644
--- a/cpp/benchmarks/join/conditional_join_benchmark.cu
+++ b/cpp/benchmarks/join/conditional_join_benchmark.cu
@@ -26,7 +26,7 @@ class ConditionalJoin : public cudf::benchmark {
   {                                                                                     \
     auto join = [](cudf::table_view const& left,                                        \
                    cudf::table_view const& right,                                       \
-                   cudf::ast::expression binary_pred,                                   \
+                   cudf::ast::operation binary_pred,                                    \
                    cudf::null_equality compare_nulls) {                                 \
       return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls);     \
     };                                                                                  \
@@ -45,7 +45,7 @@ CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int6
   {                                                                                    \
     auto join = [](cudf::table_view const& left,                                       \
                    cudf::table_view const& right,                                      \
-                   cudf::ast::expression binary_pred,                                  \
+                   cudf::ast::operation binary_pred,                                   \
                    cudf::null_equality compare_nulls) {                                \
       return cudf::conditional_left_join(left, right, binary_pred, compare_nulls);     \
     };                                                                                 \
@@ -64,7 +64,7 @@ CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_
   {                                                                                    \
     auto join = [](cudf::table_view const& left,                                       \
                    cudf::table_view const& right,                                      \
-                   cudf::ast::expression binary_pred,                                  \
+                   cudf::ast::operation binary_pred,                                   \
                    cudf::null_equality compare_nulls) {                                \
       return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls);    \
     };                                                                                 \
@@ -83,7 +83,7 @@ CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_
   {                                                                                         \
     auto join = [](cudf::table_view const& left,                                            \
                    cudf::table_view const& right,                                           \
-                   cudf::ast::expression binary_pred,                                       \
+                   cudf::ast::operation binary_pred,                                        \
                    cudf::null_equality compare_nulls) {                                     \
       return cudf::conditional_left_anti_join(left, right, binary_pred, compare_nulls);     \
     };                                                                                      \
@@ -114,7 +114,7 @@ CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nul
   {                                                                                         \
     auto join = [](cudf::table_view const& left,                                            \
                    cudf::table_view const& right,                                           \
-                   cudf::ast::expression binary_pred,                                       \
+                   cudf::ast::operation binary_pred,                                        \
                    cudf::null_equality compare_nulls) {                                     \
       return cudf::conditional_left_semi_join(left, right, binary_pred, compare_nulls);     \
     };                                                                                      \
@@ -145,11 +145,6 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit)
   ->Args({100'000, 100'000})
   ->Args({100'000, 400'000})
   ->Args({100'000, 1'000'000})
-  // TODO: The below benchmark is slow, but can be useful to validate that the
-  // code works for large data sets. This benchmark was used to compare to the
-  // otherwise equivalent nullable benchmark below, which has memory errors for
-  // sufficiently large data sets.
-  //->Args({1'000'000, 1'000'000})
   ->UseManualTime();
 
 BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit)
diff --git a/cpp/benchmarks/join/join_benchmark_common.hpp b/cpp/benchmarks/join/join_benchmark_common.hpp
index e6fed454707..add87bf7dfb 100644
--- a/cpp/benchmarks/join/join_benchmark_common.hpp
+++ b/cpp/benchmarks/join/join_benchmark_common.hpp
@@ -21,6 +21,7 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
+#include <cudf/ast/expressions.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/error.hpp>
@@ -139,7 +140,7 @@ static void BM_join(state_type& state, Join JoinFunc)
     const auto col_ref_left_0  = cudf::ast::column_reference(0);
     const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
     auto left_zero_eq_right_zero =
-      cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
+      cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
 
     for (auto _ : state) {
       cuda_event_timer raii(state, true, rmm::cuda_stream_default);
diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
index ca2cab96123..fb198761115 100644
--- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh
+++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
@@ -17,8 +17,7 @@
 
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/ast/detail/operators.hpp>
-#include <cudf/ast/nodes.hpp>
-#include <cudf/ast/operators.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/utilities/assert.cuh>
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index 9eca250b898..1f35b54ea61 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -15,8 +15,7 @@
  */
 #pragma once
 
-#include <cudf/ast/nodes.hpp>
-#include <cudf/ast/operators.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
@@ -44,7 +43,7 @@ enum class device_data_reference_type {
 };
 
 /**
- * @brief A device data reference describes a source of data used by a node.
+ * @brief A device data reference describes a source of data used by an expression.
  *
  * This is a POD class used to create references describing data type and locations for consumption
  * by the `row_evaluator`.
@@ -115,11 +114,11 @@ struct expression_device_view {
  * @brief The expression_parser traverses an expression and converts it into a form suitable for
  * execution on the device.
  *
- * This class is part of a "visitor" pattern with the `node` class.
+ * This class is part of a "visitor" pattern with the `expression` class.
  *
  * This class does pre-processing work on the host, validating operators and operand data types. It
- * traverses downward from a root node in a depth-first fashion, capturing information about
- * the nodes and constructing vectors of information that are later used by the device for
+ * traverses downward from a root expression in a depth-first fashion, capturing information about
+ * the expressions and constructing vectors of information that are later used by the device for
  * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are
  * resolved into intermediate data storage in shared memory.
  */
@@ -132,13 +131,17 @@ class expression_parser {
    * @param left The left table used for evaluating the abstract syntax tree.
    * @param right The right table used for evaluating the abstract syntax tree.
    */
-  expression_parser(node const& expr,
+  expression_parser(expression const& expr,
                     cudf::table_view const& left,
                     std::optional<std::reference_wrapper<cudf::table_view const>> right,
                     bool has_nulls,
                     rmm::cuda_stream_view stream,
                     rmm::mr::device_memory_resource* mr)
-    : _left{left}, _right{right}, _node_count{0}, _intermediate_counter{}, _has_nulls(has_nulls)
+    : _left{left},
+      _right{right},
+      _expression_count{0},
+      _intermediate_counter{},
+      _has_nulls(has_nulls)
   {
     expr.accept(*this);
     move_to_device(stream, mr);
@@ -150,7 +153,7 @@ class expression_parser {
    * @param expr The expression to create an evaluable expression_parser for.
    * @param table The table used for evaluating the abstract syntax tree.
    */
-  expression_parser(node const& expr,
+  expression_parser(expression const& expr,
                     cudf::table_view const& table,
                     bool has_nulls,
                     rmm::cuda_stream_view stream,
@@ -167,33 +170,33 @@ class expression_parser {
   cudf::data_type output_type() const;
 
   /**
-   * @brief Visit a literal node.
+   * @brief Visit a literal expression.
    *
-   * @param expr Literal node.
-   * @return cudf::size_type Index of device data reference for the node.
+   * @param expr Literal expression.
+   * @return cudf::size_type Index of device data reference for the expression.
    */
   cudf::size_type visit(literal const& expr);
 
   /**
-   * @brief Visit a column reference node.
+   * @brief Visit a column reference expression.
    *
-   * @param expr Column reference node.
-   * @return cudf::size_type Index of device data reference for the node.
+   * @param expr Column reference expression.
+   * @return cudf::size_type Index of device data reference for the expression.
    */
   cudf::size_type visit(column_reference const& expr);
 
   /**
-   * @brief Visit an expression node.
+   * @brief Visit an operation expression.
    *
-   * @param expr Expression node.
-   * @return cudf::size_type Index of device data reference for the node.
+   * @param expr Operation expression.
+   * @return cudf::size_type Index of device data reference for the expression.
    */
-  cudf::size_type visit(expression const& expr);
+  cudf::size_type visit(operation const& expr);
 
   /**
    * @brief Internal class used to track the utilization of intermediate storage locations.
    *
-   * As nodes are being evaluated, they may generate "intermediate" data that is immediately
+   * As expressions are being evaluated, they may generate "intermediate" data that is immediately
    * consumed. Rather than manifesting this data in global memory, we can store intermediates of any
    * fixed width type (up to 8 bytes) by placing them in shared memory. This class helps to track
    * the number and indices of intermediate data in shared memory using a give-take model. Locations
@@ -308,7 +311,7 @@ class expression_parser {
    * @return The indices of the operands stored in the data references.
    */
   std::vector<cudf::size_type> visit_operands(
-    std::vector<std::reference_wrapper<node const>> operands);
+    std::vector<std::reference_wrapper<expression const>> operands);
 
   /**
    * @brief Add a data reference to the internal list.
@@ -325,7 +328,7 @@ class expression_parser {
 
   cudf::table_view const& _left;
   std::optional<std::reference_wrapper<cudf::table_view const>> _right;
-  cudf::size_type _node_count;
+  cudf::size_type _expression_count;
   intermediate_counter _intermediate_counter;
   bool _has_nulls;
   std::vector<detail::device_data_reference> _data_references;
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index fd3a0775401..00723004a9f 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <cudf/ast/operators.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
diff --git a/cpp/include/cudf/ast/nodes.hpp b/cpp/include/cudf/ast/expressions.hpp
similarity index 59%
rename from cpp/include/cudf/ast/nodes.hpp
rename to cpp/include/cudf/ast/expressions.hpp
index f36d7bcd3c7..d9ba197f8fe 100644
--- a/cpp/include/cudf/ast/nodes.hpp
+++ b/cpp/include/cudf/ast/expressions.hpp
@@ -15,8 +15,6 @@
  */
 #pragma once
 
-#include <cudf/ast/detail/operators.hpp>
-#include <cudf/ast/operators.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_view.hpp>
@@ -25,21 +23,75 @@
 
 namespace cudf {
 namespace ast {
-namespace detail {
 
-// Forward declaration
+// Forward declaration.
+namespace detail {
 class expression_parser;
+}
+
 /**
- * @brief A generic node that can be evaluated to return a value.
+ * @brief A generic expression that can be evaluated to return a value.
  *
  * This class is a part of a "visitor" pattern with the `linearizer` class.
  * Nodes inheriting from this class can accept visitors.
  */
-struct node {
-  virtual cudf::size_type accept(expression_parser& visitor) const = 0;
+struct expression {
+  virtual cudf::size_type accept(detail::expression_parser& visitor) const = 0;
+
+  virtual ~expression() {}
 };
 
-}  // namespace detail
+/**
+ * @brief Enum of supported operators.
+ */
+enum class ast_operator {
+  // Binary operators
+  ADD,            ///< operator +
+  SUB,            ///< operator -
+  MUL,            ///< operator *
+  DIV,            ///< operator / using common type of lhs and rhs
+  TRUE_DIV,       ///< operator / after promoting type to floating point
+  FLOOR_DIV,      ///< operator / after promoting to 64 bit floating point and then
+                  ///< flooring the result
+  MOD,            ///< operator %
+  PYMOD,          ///< operator % but following python's sign rules for negatives
+  POW,            ///< lhs ^ rhs
+  EQUAL,          ///< operator ==
+  NOT_EQUAL,      ///< operator !=
+  LESS,           ///< operator <
+  GREATER,        ///< operator >
+  LESS_EQUAL,     ///< operator <=
+  GREATER_EQUAL,  ///< operator >=
+  BITWISE_AND,    ///< operator &
+  BITWISE_OR,     ///< operator |
+  BITWISE_XOR,    ///< operator ^
+  LOGICAL_AND,    ///< operator &&
+  LOGICAL_OR,     ///< operator ||
+  // Unary operators
+  IDENTITY,    ///< Identity function
+  SIN,         ///< Trigonometric sine
+  COS,         ///< Trigonometric cosine
+  TAN,         ///< Trigonometric tangent
+  ARCSIN,      ///< Trigonometric sine inverse
+  ARCCOS,      ///< Trigonometric cosine inverse
+  ARCTAN,      ///< Trigonometric tangent inverse
+  SINH,        ///< Hyperbolic sine
+  COSH,        ///< Hyperbolic cosine
+  TANH,        ///< Hyperbolic tangent
+  ARCSINH,     ///< Hyperbolic sine inverse
+  ARCCOSH,     ///< Hyperbolic cosine inverse
+  ARCTANH,     ///< Hyperbolic tangent inverse
+  EXP,         ///< Exponential (base e, Euler number)
+  LOG,         ///< Natural Logarithm (base e)
+  SQRT,        ///< Square-root (x^0.5)
+  CBRT,        ///< Cube-root (x^(1.0/3))
+  CEIL,        ///< Smallest integer value not less than arg
+  FLOOR,       ///< largest integer value not greater than arg
+  ABS,         ///< Absolute value
+  RINT,        ///< Rounds the floating-point argument arg to an integer value
+  BIT_INVERT,  ///< Bitwise Not (~)
+  NOT          ///< Logical Not (!)
+};
 
 /**
  * @brief Enum of table references.
@@ -55,7 +107,7 @@ enum class table_reference {
 /**
  * @brief A literal value used in an abstract syntax tree.
  */
-class literal : public detail::node {
+class literal : public expression {
  public:
   /**
    * @brief Construct a new literal object.
@@ -117,14 +169,14 @@ class literal : public detail::node {
 };
 
 /**
- * @brief A node referring to data from a column in a table.
+ * @brief An expression referring to data from a column in a table.
  */
-class column_reference : public detail::node {
+class column_reference : public expression {
  public:
   /**
    * @brief Construct a new column reference object
    *
-   * @param column_index Index of this column in the table (provided when the node is
+   * @param column_index Index of this column in the table (provided when the expression is
    * evaluated).
    * @param table_source Which table to use in cases with two tables (e.g. joins).
    */
@@ -194,43 +246,33 @@ class column_reference : public detail::node {
 };
 
 /**
- * @brief An expression node holds an operator and zero or more operands.
+ * @brief An operation expression holds an operator and zero or more operands.
  */
-class expression : public detail::node {
+class operation : public expression {
  public:
   /**
-   * @brief Construct a new unary expression object.
+   * @brief Construct a new unary operation object.
    *
    * @param op Operator
-   * @param input Input node (operand)
+   * @param input Input expression (operand)
    */
-  expression(ast_operator op, node const& input) : op(op), operands({input})
-  {
-    if (cudf::ast::detail::ast_operator_arity(op) != 1) {
-      CUDF_FAIL("The provided operator is not a unary operator.");
-    }
-  }
+  operation(ast_operator op, expression const& input);
 
   /**
-   * @brief Construct a new binary expression object.
+   * @brief Construct a new binary operation object.
    *
    * @param op Operator
-   * @param left Left input node (left operand)
-   * @param right Right input node (right operand)
+   * @param left Left input expression (left operand)
+   * @param right Right input expression (right operand)
    */
-  expression(ast_operator op, node const& left, node const& right) : op(op), operands({left, right})
-  {
-    if (cudf::ast::detail::ast_operator_arity(op) != 2) {
-      CUDF_FAIL("The provided operator is not a binary operator.");
-    }
-  }
+  operation(ast_operator op, expression const& left, expression const& right);
 
-  // expression only stores references to nodes, so it does not accept r-value
-  // references: the calling code must own the nodes.
-  expression(ast_operator op, node&& input)                   = delete;
-  expression(ast_operator op, node&& left, node&& right)      = delete;
-  expression(ast_operator op, node&& left, node const& right) = delete;
-  expression(ast_operator op, node const& left, node&& right) = delete;
+  // operation only stores references to expressions, so it does not accept r-value
+  // references: the calling code must own the expressions.
+  operation(ast_operator op, expression&& input)                         = delete;
+  operation(ast_operator op, expression&& left, expression&& right)      = delete;
+  operation(ast_operator op, expression&& left, expression const& right) = delete;
+  operation(ast_operator op, expression const& left, expression&& right) = delete;
 
   /**
    * @brief Get the operator.
@@ -242,9 +284,9 @@ class expression : public detail::node {
   /**
    * @brief Get the operands.
    *
-   * @return std::vector<std::reference_wrapper<const node>>
+   * @return std::vector<std::reference_wrapper<const expression>>
    */
-  std::vector<std::reference_wrapper<node const>> get_operands() const { return operands; }
+  std::vector<std::reference_wrapper<expression const>> get_operands() const { return operands; }
 
   /**
    * @brief Accepts a visitor class.
@@ -256,7 +298,7 @@ class expression : public detail::node {
 
  private:
   ast_operator const op;
-  std::vector<std::reference_wrapper<node const>> const operands;
+  std::vector<std::reference_wrapper<expression const>> const operands;
 };
 
 }  // namespace ast
diff --git a/cpp/include/cudf/ast/operators.hpp b/cpp/include/cudf/ast/operators.hpp
deleted file mode 100644
index 78e56340246..00000000000
--- a/cpp/include/cudf/ast/operators.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-namespace cudf {
-
-namespace ast {
-
-/**
- * @brief Enum of supported operators.
- */
-enum class ast_operator {
-  // Binary operators
-  ADD,            ///< operator +
-  SUB,            ///< operator -
-  MUL,            ///< operator *
-  DIV,            ///< operator / using common type of lhs and rhs
-  TRUE_DIV,       ///< operator / after promoting type to floating point
-  FLOOR_DIV,      ///< operator / after promoting to 64 bit floating point and then
-                  ///< flooring the result
-  MOD,            ///< operator %
-  PYMOD,          ///< operator % but following python's sign rules for negatives
-  POW,            ///< lhs ^ rhs
-  EQUAL,          ///< operator ==
-  NOT_EQUAL,      ///< operator !=
-  LESS,           ///< operator <
-  GREATER,        ///< operator >
-  LESS_EQUAL,     ///< operator <=
-  GREATER_EQUAL,  ///< operator >=
-  BITWISE_AND,    ///< operator &
-  BITWISE_OR,     ///< operator |
-  BITWISE_XOR,    ///< operator ^
-  LOGICAL_AND,    ///< operator &&
-  LOGICAL_OR,     ///< operator ||
-  // Unary operators
-  IDENTITY,    ///< Identity function
-  SIN,         ///< Trigonometric sine
-  COS,         ///< Trigonometric cosine
-  TAN,         ///< Trigonometric tangent
-  ARCSIN,      ///< Trigonometric sine inverse
-  ARCCOS,      ///< Trigonometric cosine inverse
-  ARCTAN,      ///< Trigonometric tangent inverse
-  SINH,        ///< Hyperbolic sine
-  COSH,        ///< Hyperbolic cosine
-  TANH,        ///< Hyperbolic tangent
-  ARCSINH,     ///< Hyperbolic sine inverse
-  ARCCOSH,     ///< Hyperbolic cosine inverse
-  ARCTANH,     ///< Hyperbolic tangent inverse
-  EXP,         ///< Exponential (base e, Euler number)
-  LOG,         ///< Natural Logarithm (base e)
-  SQRT,        ///< Square-root (x^0.5)
-  CBRT,        ///< Cube-root (x^(1.0/3))
-  CEIL,        ///< Smallest integer value not less than arg
-  FLOOR,       ///< largest integer value not greater than arg
-  ABS,         ///< Absolute value
-  RINT,        ///< Rounds the floating-point argument arg to an integer value
-  BIT_INVERT,  ///< Bitwise Not (~)
-  NOT          ///< Logical Not (!)
-};
-
-}  // namespace ast
-
-}  // namespace cudf
diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp
index 96ef27529be..12948498455 100644
--- a/cpp/include/cudf/detail/transform.hpp
+++ b/cpp/include/cudf/detail/transform.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/ast/nodes.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/transform.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -43,7 +43,7 @@ std::unique_ptr<column> transform(
  */
 std::unique_ptr<column> compute_column(
   table_view const table,
-  ast::expression const& expr,
+  ast::operation const& expr,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index dbafa95ee77..483cd75c739 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/ast/nodes.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
@@ -687,9 +687,9 @@ class hash_join {
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 conditional_inner_join(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls            = null_equality::EQUAL,
   std::optional<std::size_t> output_size = {},
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
@@ -733,9 +733,9 @@ conditional_inner_join(
  */
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-conditional_left_join(table_view left,
-                      table_view right,
-                      ast::expression binary_predicate,
+conditional_left_join(table_view const& left,
+                      table_view const& right,
+                      ast::expression const& binary_predicate,
                       null_equality compare_nulls            = null_equality::EQUAL,
                       std::optional<std::size_t> output_size = {},
                       rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
@@ -778,9 +778,9 @@ conditional_left_join(table_view left,
  */
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-conditional_full_join(table_view left,
-                      table_view right,
-                      ast::expression binary_predicate,
+conditional_full_join(table_view const& left,
+                      table_view const& right,
+                      ast::expression const& binary_predicate,
                       null_equality compare_nulls         = null_equality::EQUAL,
                       rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -817,9 +817,9 @@ conditional_full_join(table_view left,
  * `right` .
  */
 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls            = null_equality::EQUAL,
   std::optional<std::size_t> output_size = {},
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
@@ -857,9 +857,9 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
  * `right` .
  */
 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls            = null_equality::EQUAL,
   std::optional<std::size_t> output_size = {},
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
@@ -883,9 +883,9 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
  * @return The size that would result from performing the requested join.
  */
 std::size_t conditional_inner_join_size(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -908,9 +908,9 @@ std::size_t conditional_inner_join_size(
  * @return The size that would result from performing the requested join.
  */
 std::size_t conditional_left_join_size(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -933,9 +933,9 @@ std::size_t conditional_left_join_size(
  * @return The size that would result from performing the requested join.
  */
 std::size_t conditional_left_semi_join_size(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -958,9 +958,9 @@ std::size_t conditional_left_semi_join_size(
  * @return The size that would result from performing the requested join.
  */
 std::size_t conditional_left_anti_join_size(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 /** @} */  // end of group
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index cf391b2b23d..af2858d948e 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/ast/nodes.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/types.hpp>
 
 #include <memory>
@@ -89,7 +89,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
  * @return std::unique_ptr<column> Output column.
  */
 std::unique_ptr<column> compute_column(
-  table_view const table,
+  table_view const& table,
   ast::expression const& expr,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp
index 760f47a5045..1072bff43dd 100644
--- a/cpp/src/ast/expression_parser.cpp
+++ b/cpp/src/ast/expression_parser.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 #include <cudf/ast/detail/expression_parser.hpp>
-#include <cudf/ast/nodes.hpp>
-#include <cudf/ast/operators.hpp>
+#include <cudf/ast/detail/operators.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_view.hpp>
@@ -85,46 +85,57 @@ cudf::size_type expression_parser::intermediate_counter::find_first_missing() co
 
 cudf::size_type expression_parser::visit(literal const& expr)
 {
-  _node_count++;                                                 // Increment the node index
-  auto const data_type     = expr.get_data_type();               // Resolve node type
-  auto device_view         = expr.get_value();                   // Construct a scalar device view
-  auto const literal_index = cudf::size_type(_literals.size());  // Push literal
-  _literals.push_back(device_view);
-  auto const source = detail::device_data_reference(
-    detail::device_data_reference_type::LITERAL, data_type, literal_index);  // Push data reference
-  return add_data_reference(source);
+  if (_expression_count == 0) {
+    // Handle the trivial case of a literal as the entire expression.
+    return visit(operation(ast_operator::IDENTITY, expr));
+  } else {
+    _expression_count++;                                           // Increment the expression index
+    auto const data_type     = expr.get_data_type();               // Resolve expression type
+    auto device_view         = expr.get_value();                   // Construct a scalar device view
+    auto const literal_index = cudf::size_type(_literals.size());  // Push literal
+    _literals.push_back(device_view);
+    auto const source = detail::device_data_reference(detail::device_data_reference_type::LITERAL,
+                                                      data_type,
+                                                      literal_index);  // Push data reference
+    return add_data_reference(source);
+  }
 }
 
 cudf::size_type expression_parser::visit(column_reference const& expr)
 {
-  // Increment the node index
-  _node_count++;
-  // Resolve node type
-  cudf::data_type data_type;
-  if (expr.get_table_source() == table_reference::LEFT) {
-    data_type = expr.get_data_type(_left);
+  if (_expression_count == 0) {
+    // Handle the trivial case of a column reference as the entire expression.
+    return visit(operation(ast_operator::IDENTITY, expr));
   } else {
-    if (_right.has_value()) {
-      data_type = expr.get_data_type(*_right);
+    // Increment the expression index
+    _expression_count++;
+    // Resolve expression type
+    cudf::data_type data_type;
+    if (expr.get_table_source() == table_reference::LEFT) {
+      data_type = expr.get_data_type(_left);
     } else {
-      CUDF_FAIL(
-        "Your expression contains a reference to the RIGHT table even though it will only be "
-        "evaluated on a single table (by convention, the LEFT table).");
+      if (_right.has_value()) {
+        data_type = expr.get_data_type(*_right);
+      } else {
+        CUDF_FAIL(
+          "Your expression contains a reference to the RIGHT table even though it will only be "
+          "evaluated on a single table (by convention, the LEFT table).");
+      }
     }
+    // Push data reference
+    auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN,
+                                                      data_type,
+                                                      expr.get_column_index(),
+                                                      expr.get_table_source());
+    return add_data_reference(source);
   }
-  // Push data reference
-  auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN,
-                                                    data_type,
-                                                    expr.get_column_index(),
-                                                    expr.get_table_source());
-  return add_data_reference(source);
 }
 
-cudf::size_type expression_parser::visit(expression const& expr)
+cudf::size_type expression_parser::visit(operation const& expr)
 {
-  // Increment the node index
-  auto const node_index = _node_count++;
-  // Visit children (operands) of this node
+  // Increment the expression index
+  auto const expression_index = _expression_count++;
+  // Visit children (operands) of this expression
   auto const operand_data_ref_indices = visit_operands(expr.get_operands());
   // Resolve operand types
   auto data_ref = [this](auto const& index) { return _data_references[index].data_type; };
@@ -149,18 +160,18 @@ cudf::size_type expression_parser::visit(expression const& expr)
         _intermediate_counter.give(intermediate_index);
       }
     });
-  // Resolve node type
+  // Resolve expression type
   auto const op        = expr.get_operator();
   auto const data_type = cudf::ast::detail::ast_operator_return_type(op, operand_types);
   _operators.push_back(op);
   // Push data reference
   auto const output = [&]() {
-    if (node_index == 0) {
-      // This node is the root. Output should be directed to the output column.
+    if (expression_index == 0) {
+      // This expression is the root. Output should be directed to the output column.
       return detail::device_data_reference(
         detail::device_data_reference_type::COLUMN, data_type, 0, table_reference::OUTPUT);
     } else {
-      // This node is not the root. Output is an intermediate value.
+      // This expression is not the root. Output is an intermediate value.
       // Ensure that the output type is fixed width and fits in the intermediate storage.
       if (!cudf::is_fixed_width(data_type)) {
         CUDF_FAIL(
@@ -189,7 +200,7 @@ cudf::data_type expression_parser::output_type() const
 }
 
 std::vector<cudf::size_type> expression_parser::visit_operands(
-  std::vector<std::reference_wrapper<detail::node const>> operands)
+  std::vector<std::reference_wrapper<expression const>> operands)
 {
   auto operand_data_reference_indices = std::vector<cudf::size_type>();
   for (auto const& operand : operands) {
@@ -214,19 +225,6 @@ cudf::size_type expression_parser::add_data_reference(detail::device_data_refere
 
 }  // namespace detail
 
-cudf::size_type literal::accept(detail::expression_parser& visitor) const
-{
-  return visitor.visit(*this);
-}
-cudf::size_type column_reference::accept(detail::expression_parser& visitor) const
-{
-  return visitor.visit(*this);
-}
-cudf::size_type expression::accept(detail::expression_parser& visitor) const
-{
-  return visitor.visit(*this);
-}
-
 }  // namespace ast
 
 }  // namespace cudf
diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp
new file mode 100644
index 00000000000..88cc6650d6c
--- /dev/null
+++ b/cpp/src/ast/expressions.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/ast/detail/expression_parser.hpp>
+#include <cudf/ast/detail/operators.hpp>
+#include <cudf/ast/expressions.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/scalar/scalar_device_view.cuh>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+namespace cudf {
+namespace ast {
+
+operation::operation(ast_operator op, expression const& input) : op(op), operands({input})
+{
+  if (cudf::ast::detail::ast_operator_arity(op) != 1) {
+    CUDF_FAIL("The provided operator is not a unary operator.");
+  }
+}
+
+operation::operation(ast_operator op, expression const& left, expression const& right)
+  : op(op), operands({left, right})
+{
+  if (cudf::ast::detail::ast_operator_arity(op) != 2) {
+    CUDF_FAIL("The provided operator is not a binary operator.");
+  }
+}
+
+cudf::size_type literal::accept(detail::expression_parser& visitor) const
+{
+  return visitor.visit(*this);
+}
+cudf::size_type column_reference::accept(detail::expression_parser& visitor) const
+{
+  return visitor.visit(*this);
+}
+cudf::size_type operation::accept(detail::expression_parser& visitor) const
+{
+  return visitor.visit(*this);
+}
+
+}  // namespace ast
+
+}  // namespace cudf
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index ee076d80140..bfabe99aaf9 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -15,7 +15,7 @@
  */
 
 #include <cudf/ast/detail/expression_parser.hpp>
-#include <cudf/ast/nodes.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
@@ -38,7 +38,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 conditional_join(table_view const& left,
                  table_view const& right,
-                 ast::expression binary_predicate,
+                 ast::expression const& binary_predicate,
                  null_equality compare_nulls,
                  join_kind join_type,
                  std::optional<std::size_t> output_size,
@@ -171,7 +171,7 @@ conditional_join(table_view const& left,
 
 std::size_t compute_conditional_join_output_size(table_view const& left,
                                                  table_view const& right,
-                                                 ast::expression binary_predicate,
+                                                 ast::expression const& binary_predicate,
                                                  null_equality compare_nulls,
                                                  join_kind join_type,
                                                  rmm::cuda_stream_view stream,
@@ -248,9 +248,9 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
 
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-conditional_inner_join(table_view left,
-                       table_view right,
-                       ast::expression binary_predicate,
+conditional_inner_join(table_view const& left,
+                       table_view const& right,
+                       ast::expression const& binary_predicate,
                        null_equality compare_nulls,
                        std::optional<std::size_t> output_size,
                        rmm::mr::device_memory_resource* mr)
@@ -268,9 +268,9 @@ conditional_inner_join(table_view left,
 
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-conditional_left_join(table_view left,
-                      table_view right,
-                      ast::expression binary_predicate,
+conditional_left_join(table_view const& left,
+                      table_view const& right,
+                      ast::expression const& binary_predicate,
                       null_equality compare_nulls,
                       std::optional<std::size_t> output_size,
                       rmm::mr::device_memory_resource* mr)
@@ -288,9 +288,9 @@ conditional_left_join(table_view left,
 
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-conditional_full_join(table_view left,
-                      table_view right,
-                      ast::expression binary_predicate,
+conditional_full_join(table_view const& left,
+                      table_view const& right,
+                      ast::expression const& binary_predicate,
                       null_equality compare_nulls,
                       rmm::mr::device_memory_resource* mr)
 {
@@ -306,9 +306,9 @@ conditional_full_join(table_view left,
 }
 
 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls,
   std::optional<std::size_t> output_size,
   rmm::mr::device_memory_resource* mr)
@@ -326,9 +326,9 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
 }
 
 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
-  table_view left,
-  table_view right,
-  ast::expression binary_predicate,
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls,
   std::optional<std::size_t> output_size,
   rmm::mr::device_memory_resource* mr)
@@ -345,9 +345,9 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
                      .first);
 }
 
-std::size_t conditional_inner_join_size(table_view left,
-                                        table_view right,
-                                        ast::expression binary_predicate,
+std::size_t conditional_inner_join_size(table_view const& left,
+                                        table_view const& right,
+                                        ast::expression const& binary_predicate,
                                         null_equality compare_nulls,
                                         rmm::mr::device_memory_resource* mr)
 {
@@ -361,9 +361,9 @@ std::size_t conditional_inner_join_size(table_view left,
                                                       mr);
 }
 
-std::size_t conditional_left_join_size(table_view left,
-                                       table_view right,
-                                       ast::expression binary_predicate,
+std::size_t conditional_left_join_size(table_view const& left,
+                                       table_view const& right,
+                                       ast::expression const& binary_predicate,
                                        null_equality compare_nulls,
                                        rmm::mr::device_memory_resource* mr)
 {
@@ -377,9 +377,9 @@ std::size_t conditional_left_join_size(table_view left,
                                                       mr);
 }
 
-std::size_t conditional_left_semi_join_size(table_view left,
-                                            table_view right,
-                                            ast::expression binary_predicate,
+std::size_t conditional_left_semi_join_size(table_view const& left,
+                                            table_view const& right,
+                                            ast::expression const& binary_predicate,
                                             null_equality compare_nulls,
                                             rmm::mr::device_memory_resource* mr)
 {
@@ -393,9 +393,9 @@ std::size_t conditional_left_semi_join_size(table_view left,
                                                                 mr));
 }
 
-std::size_t conditional_left_anti_join_size(table_view left,
-                                            table_view right,
-                                            ast::expression binary_predicate,
+std::size_t conditional_left_anti_join_size(table_view const& left,
+                                            table_view const& right,
+                                            ast::expression const& binary_predicate,
                                             null_equality compare_nulls,
                                             rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp
index b5b49815381..5a3fe887838 100644
--- a/cpp/src/join/conditional_join.hpp
+++ b/cpp/src/join/conditional_join.hpp
@@ -17,7 +17,7 @@
 
 #include "join_common_utils.hpp"
 
-#include <cudf/ast/nodes.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -45,7 +45,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 conditional_join(table_view const& left,
                  table_view const& right,
-                 ast::expression binary_predicate,
+                 ast::expression const& binary_predicate,
                  null_equality compare_nulls,
                  join_kind JoinKind,
                  std::optional<std::size_t> output_size = {},
@@ -68,7 +68,7 @@ conditional_join(table_view const& left,
 std::size_t compute_conditional_join_output_size(
   table_view const& left,
   table_view const& right,
-  ast::expression binary_predicate,
+  ast::expression const& binary_predicate,
   null_equality compare_nulls,
   join_kind JoinKind,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index cd8196e555c..1466ee9ad27 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -16,7 +16,7 @@
 
 #include <cudf/ast/detail/expression_evaluator.cuh>
 #include <cudf/ast/detail/expression_parser.hpp>
-#include <cudf/ast/nodes.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -78,7 +78,7 @@ __launch_bounds__(max_block_size) __global__
   }
 }
 
-std::unique_ptr<column> compute_column(table_view const table,
+std::unique_ptr<column> compute_column(table_view const& table,
                                        ast::expression const& expr,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
@@ -135,7 +135,7 @@ std::unique_ptr<column> compute_column(table_view const table,
 
 }  // namespace detail
 
-std::unique_ptr<column> compute_column(table_view const table,
+std::unique_ptr<column> compute_column(table_view const& table,
                                        ast::expression const& expr,
                                        rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index 19797d0ce2e..de6c9d486ec 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cudf/ast/operators.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
@@ -47,6 +47,35 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve
 struct TransformTest : public cudf::test::BaseFixture {
 };
 
+TEST_F(TransformTest, ColumnReference)
+{
+  auto c_0   = column_wrapper<int32_t>{3, 20, 1, 50};
+  auto c_1   = column_wrapper<int32_t>{10, 7, 20, 0};
+  auto table = cudf::table_view{{c_0, c_1}};
+
+  auto col_ref_0 = cudf::ast::column_reference(0);
+
+  auto const& expected = c_0;
+  auto result          = cudf::compute_column(table, col_ref_0);
+
+  cudf::test::expect_columns_equal(expected, result->view(), verbosity);
+}
+
+TEST_F(TransformTest, Literal)
+{
+  auto c_0   = column_wrapper<int32_t>{3, 20, 1, 50};
+  auto c_1   = column_wrapper<int32_t>{10, 7, 20, 0};
+  auto table = cudf::table_view{{c_0, c_1}};
+
+  auto literal_value = cudf::numeric_scalar<int32_t>(42);
+  auto literal       = cudf::ast::literal(literal_value);
+
+  auto expected = column_wrapper<int32_t>{42, 42, 42, 42};
+  auto result   = cudf::compute_column(table, literal);
+
+  cudf::test::expect_columns_equal(expected, result->view(), verbosity);
+}
+
 TEST_F(TransformTest, BasicAddition)
 {
   auto c_0   = column_wrapper<int32_t>{3, 20, 1, 50};
@@ -55,7 +84,7 @@ TEST_F(TransformTest, BasicAddition)
 
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
 
   auto expected = column_wrapper<int32_t>{13, 27, 21, 50};
   auto result   = cudf::compute_column(table, expression);
@@ -70,7 +99,7 @@ TEST_F(TransformTest, BasicAdditionLarge)
   auto table = cudf::table_view{{col, col}};
 
   auto col_ref    = cudf::ast::column_reference(0);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref, col_ref);
 
   auto b        = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
   auto expected = column_wrapper<int32_t>(b, b + 2000);
@@ -87,7 +116,7 @@ TEST_F(TransformTest, LessComparator)
 
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
 
   auto expected = column_wrapper<bool>{true, false, true, false};
   auto result   = cudf::compute_column(table, expression);
@@ -105,7 +134,7 @@ TEST_F(TransformTest, LessComparatorLarge)
 
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
 
   auto c        = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; });
   auto expected = column_wrapper<bool>(c, c + 2000);
@@ -126,12 +155,12 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic)
   auto col_ref_2 = cudf::ast::column_reference(2);
 
   auto expression_left_subtree =
-    cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
+    cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
 
   auto expression_right_subtree =
-    cudf::ast::expression(cudf::ast::ast_operator::SUB, col_ref_2, col_ref_0);
+    cudf::ast::operation(cudf::ast::ast_operator::SUB, col_ref_2, col_ref_0);
 
-  auto expression_tree = cudf::ast::expression(
+  auto expression_tree = cudf::ast::operation(
     cudf::ast::ast_operator::ADD, expression_left_subtree, expression_right_subtree);
 
   auto result   = cudf::compute_column(table, expression_tree);
@@ -142,8 +171,6 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic)
 
 TEST_F(TransformTest, MultiLevelTreeArithmeticLarge)
 {
-  using namespace cudf::ast;
-
   auto a     = thrust::make_counting_iterator(0);
   auto b     = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i + 1; });
   auto c     = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
@@ -152,13 +179,15 @@ TEST_F(TransformTest, MultiLevelTreeArithmeticLarge)
   auto c_2   = column_wrapper<int32_t>(c, c + 2000);
   auto table = cudf::table_view{{c_0, c_1, c_2}};
 
-  auto col_ref_0 = column_reference(0);
-  auto col_ref_1 = column_reference(1);
-  auto col_ref_2 = column_reference(2);
+  auto col_ref_0 = cudf::ast::column_reference(0);
+  auto col_ref_1 = cudf::ast::column_reference(1);
+  auto col_ref_2 = cudf::ast::column_reference(2);
 
-  auto expr_left_subtree  = expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1);
-  auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0);
-  auto expr_tree          = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree);
+  auto expr_left_subtree = cudf::ast::operation(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1);
+  auto expr_right_subtree =
+    cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0);
+  auto expr_tree =
+    cudf::ast::operation(cudf::ast::ast_operator::SUB, expr_left_subtree, expr_right_subtree);
 
   auto result = cudf::compute_column(table, expr_tree);
   auto calc   = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); };
@@ -180,10 +209,10 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic)
   auto col_ref_2 = cudf::ast::column_reference(2);
 
   auto expression_right_subtree =
-    cudf::ast::expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1);
+    cudf::ast::operation(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1);
 
   auto expression_tree =
-    cudf::ast::expression(cudf::ast::ast_operator::SUB, col_ref_2, expression_right_subtree);
+    cudf::ast::operation(cudf::ast::ast_operator::SUB, col_ref_2, expression_right_subtree);
 
   auto result = cudf::compute_column(table, expression_tree);
   auto expected =
@@ -204,12 +233,12 @@ TEST_F(TransformTest, MultiLevelTreeComparator)
   auto col_ref_2 = cudf::ast::column_reference(2);
 
   auto expression_left_subtree =
-    cudf::ast::expression(cudf::ast::ast_operator::GREATER_EQUAL, col_ref_0, col_ref_1);
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, col_ref_0, col_ref_1);
 
   auto expression_right_subtree =
-    cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_2, col_ref_0);
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_2, col_ref_0);
 
-  auto expression_tree = cudf::ast::expression(
+  auto expression_tree = cudf::ast::operation(
     cudf::ast::ast_operator::LOGICAL_AND, expression_left_subtree, expression_right_subtree);
 
   auto result   = cudf::compute_column(table, expression_tree);
@@ -228,9 +257,9 @@ TEST_F(TransformTest, MultiTypeOperationFailure)
   auto col_ref_1 = cudf::ast::column_reference(1);
 
   auto expression_0_plus_1 =
-    cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
+    cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
   auto expression_1_plus_0 =
-    cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_1, col_ref_0);
+    cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_1, col_ref_0);
 
   // Operations on different types are not allowed
   EXPECT_THROW(cudf::compute_column(table, expression_0_plus_1), cudf::logic_error);
@@ -246,7 +275,7 @@ TEST_F(TransformTest, LiteralComparison)
   auto literal_value = cudf::numeric_scalar<int32_t>(41);
   auto literal       = cudf::ast::literal(literal_value);
 
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, literal);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, literal);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<bool>{false, false, false, true};
@@ -261,7 +290,7 @@ TEST_F(TransformTest, UnaryNot)
 
   auto col_ref_0 = cudf::ast::column_reference(0);
 
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::NOT, col_ref_0);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, col_ref_0);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<bool>{false, true, false, false};
@@ -277,17 +306,17 @@ TEST_F(TransformTest, UnaryTrigonometry)
   auto col_ref_0 = cudf::ast::column_reference(0);
 
   auto expected_sin   = column_wrapper<double>{0.0, std::sqrt(2) / 2, std::sqrt(3.0) / 2.0};
-  auto expression_sin = cudf::ast::expression(cudf::ast::ast_operator::SIN, col_ref_0);
+  auto expression_sin = cudf::ast::operation(cudf::ast::ast_operator::SIN, col_ref_0);
   auto result_sin     = cudf::compute_column(table, expression_sin);
   cudf::test::expect_columns_equivalent(expected_sin, result_sin->view(), verbosity);
 
   auto expected_cos   = column_wrapper<double>{1.0, std::sqrt(2) / 2, 0.5};
-  auto expression_cos = cudf::ast::expression(cudf::ast::ast_operator::COS, col_ref_0);
+  auto expression_cos = cudf::ast::operation(cudf::ast::ast_operator::COS, col_ref_0);
   auto result_cos     = cudf::compute_column(table, expression_cos);
   cudf::test::expect_columns_equivalent(expected_cos, result_cos->view(), verbosity);
 
   auto expected_tan   = column_wrapper<double>{0.0, 1.0, std::sqrt(3.0)};
-  auto expression_tan = cudf::ast::expression(cudf::ast::ast_operator::TAN, col_ref_0);
+  auto expression_tan = cudf::ast::operation(cudf::ast::ast_operator::TAN, col_ref_0);
   auto result_tan     = cudf::compute_column(table, expression_tan);
   cudf::test::expect_columns_equivalent(expected_tan, result_tan->view(), verbosity);
 }
@@ -295,8 +324,8 @@ TEST_F(TransformTest, UnaryTrigonometry)
 TEST_F(TransformTest, ArityCheckFailure)
 {
   auto col_ref_0 = cudf::ast::column_reference(0);
-  EXPECT_THROW(cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0), cudf::logic_error);
-  EXPECT_THROW(cudf::ast::expression(cudf::ast::ast_operator::ABS, col_ref_0, col_ref_0),
+  EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0), cudf::logic_error);
+  EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ABS, col_ref_0, col_ref_0),
                cudf::logic_error);
 }
 
@@ -308,7 +337,7 @@ TEST_F(TransformTest, StringComparison)
 
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
 
   auto expected = column_wrapper<bool>{true, false, true, false};
   auto result   = cudf::compute_column(table, expression);
@@ -322,7 +351,7 @@ TEST_F(TransformTest, CopyColumn)
   auto table = cudf::table_view{{c_0}};
 
   auto col_ref_0  = cudf::ast::column_reference(0);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, col_ref_0);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, col_ref_0);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<int32_t>{3, 0, 1, 50};
@@ -338,7 +367,7 @@ TEST_F(TransformTest, CopyLiteral)
   auto literal_value = cudf::numeric_scalar<int32_t>(-123);
   auto literal       = cudf::ast::literal(literal_value);
 
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, literal);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, literal);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<int32_t>{-123, -123, -123, -123};
@@ -355,7 +384,7 @@ TEST_F(TransformTest, TrueDiv)
   auto literal_value = cudf::numeric_scalar<int32_t>(2);
   auto literal       = cudf::ast::literal(literal_value);
 
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::TRUE_DIV, col_ref_0, literal);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::TRUE_DIV, col_ref_0, literal);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<double>{1.5, 0.0, 0.5, 25.0};
@@ -372,7 +401,7 @@ TEST_F(TransformTest, FloorDiv)
   auto literal_value = cudf::numeric_scalar<double>(2.0);
   auto literal       = cudf::ast::literal(literal_value);
 
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::FLOOR_DIV, col_ref_0, literal);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::FLOOR_DIV, col_ref_0, literal);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<double>{1.0, 0.0, 0.0, 25.0};
@@ -389,7 +418,7 @@ TEST_F(TransformTest, Mod)
   auto literal_value = cudf::numeric_scalar<double>(2.0);
   auto literal       = cudf::ast::literal(literal_value);
 
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::MOD, col_ref_0, literal);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::MOD, col_ref_0, literal);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<double>{1.0, 0.0, -1.0, 0.0};
@@ -406,7 +435,7 @@ TEST_F(TransformTest, PyMod)
   auto literal_value = cudf::numeric_scalar<double>(2.0);
   auto literal       = cudf::ast::literal(literal_value);
 
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::PYMOD, col_ref_0, literal);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::PYMOD, col_ref_0, literal);
 
   auto result   = cudf::compute_column(table, expression);
   auto expected = column_wrapper<double>{1.0, 0.0, 1.0, 0.0};
@@ -422,7 +451,7 @@ TEST_F(TransformTest, BasicAdditionNulls)
 
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
 
   auto expected = column_wrapper<int32_t>{{0, 0, 0, 50}, {0, 0, 0, 1}};
   auto result   = cudf::compute_column(table, expression);
@@ -447,7 +476,7 @@ TEST_F(TransformTest, BasicAdditionLargeNulls)
   auto table = cudf::table_view{{col}};
 
   auto col_ref    = cudf::ast::column_reference(0);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref, col_ref);
 
   auto b        = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
   auto expected = column_wrapper<int32_t>(b, b + N, validities.begin());
diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu
index e16e1ec7de8..8018d613e05 100644
--- a/cpp/tests/join/conditional_join_tests.cu
+++ b/cpp/tests/join/conditional_join_tests.cu
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#include <cudf/ast/nodes.hpp>
-#include <cudf/ast/operators.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table_view.hpp>
@@ -50,7 +49,7 @@ const auto col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_ref
 
 // Common expressions.
 auto left_zero_eq_right_zero =
-  cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
+  cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
 }  // namespace
 
 /**
@@ -147,7 +146,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
    */
   void test(std::vector<std::vector<T>> left_data,
             std::vector<std::vector<T>> right_data,
-            cudf::ast::expression predicate,
+            cudf::ast::operation predicate,
             std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
   {
     // Note that we need to maintain the column wrappers otherwise the
@@ -174,7 +173,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
 
   void test_nulls(std::vector<std::pair<std::vector<T>, std::vector<bool>>> left_data,
                   std::vector<std::pair<std::vector<T>, std::vector<bool>>> right_data,
-                  cudf::ast::expression predicate,
+                  cudf::ast::operation predicate,
                   std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
   {
     // Note that we need to maintain the column wrappers otherwise the
@@ -252,7 +251,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
    */
   virtual std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
                     std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-  join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) = 0;
+  join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) = 0;
 
   /**
    * This method must be implemented by subclasses for specific types of joins.
@@ -261,7 +260,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
    */
   virtual std::size_t join_size(cudf::table_view left,
                                 cudf::table_view right,
-                                cudf::ast::expression predicate) = 0;
+                                cudf::ast::operation predicate) = 0;
 
   /**
    * This method must be implemented by subclasses for specific types of joins.
@@ -280,14 +279,14 @@ template <typename T>
 struct ConditionalInnerJoinTest : public ConditionalJoinPairReturnTest<T> {
   std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
             std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-  join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override
+  join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override
   {
     return cudf::conditional_inner_join(left, right, predicate);
   }
 
   std::size_t join_size(cudf::table_view left,
                         cudf::table_view right,
-                        cudf::ast::expression predicate) override
+                        cudf::ast::operation predicate) override
   {
     return cudf::conditional_inner_join_size(left, right, predicate);
   }
@@ -336,7 +335,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestTwoColumnThreeRowSomeEqual)
 TYPED_TEST(ConditionalInnerJoinTest, TestNotComparison)
 {
   auto col_ref_0  = cudf::ast::column_reference(0);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::NOT, col_ref_0);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, col_ref_0);
 
   this->test({{0, 1, 2}}, {{3, 4, 5}}, expression, {{0, 0}, {0, 1}, {0, 2}});
 };
@@ -345,7 +344,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterComparison)
 {
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
 
   this->test({{0, 1, 2}}, {{1, 0, 0}}, expression, {{1, 1}, {1, 2}, {2, 0}, {2, 1}, {2, 2}});
 };
@@ -354,7 +353,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterTwoColumnComparison)
 {
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
 
   this->test({{0, 1, 2}, {0, 0, 0}},
              {{0, 0, 0}, {1, 0, 0}},
@@ -366,7 +365,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterDifferentNumberColumnComparison)
 {
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
 
   this->test(
     {{0, 1, 2}}, {{0, 0, 0}, {1, 0, 0}}, expression, {{1, 1}, {1, 2}, {2, 0}, {2, 1}, {2, 2}});
@@ -376,7 +375,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterDifferentNumberColumnDifferentSi
 {
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1);
 
   this->test({{0, 1}}, {{0, 0, 0}, {1, 0, 0}}, expression, {{1, 1}, {1, 2}});
 };
@@ -387,14 +386,14 @@ TYPED_TEST(ConditionalInnerJoinTest, TestComplexConditionMultipleColumns)
   auto col_ref_0      = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
   auto scalar_1       = cudf::numeric_scalar<TypeParam>(1);
   auto literal_1      = cudf::ast::literal(scalar_1);
-  auto left_0_equal_1 = cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_0, literal_1);
+  auto left_0_equal_1 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_0, literal_1);
 
   auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
   auto comparison_filter =
-    cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_1, col_ref_0);
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_1, col_ref_0);
 
   auto expression =
-    cudf::ast::expression(cudf::ast::ast_operator::LOGICAL_AND, left_0_equal_1, comparison_filter);
+    cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, left_0_equal_1, comparison_filter);
 
   this->test({{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}},
              {{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2},
@@ -408,9 +407,9 @@ TYPED_TEST(ConditionalInnerJoinTest, TestSymmetry)
 {
   auto col_ref_0  = cudf::ast::column_reference(0);
   auto col_ref_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
-  auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_1, col_ref_0);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_1, col_ref_0);
   auto expression_reverse =
-    cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);
 
   this->test(
     {{0, 1, 2}}, {{1, 2, 3}}, expression, {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}});
@@ -462,14 +461,14 @@ template <typename T>
 struct ConditionalLeftJoinTest : public ConditionalJoinPairReturnTest<T> {
   std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
             std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-  join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override
+  join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override
   {
     return cudf::conditional_left_join(left, right, predicate);
   }
 
   std::size_t join_size(cudf::table_view left,
                         cudf::table_view right,
-                        cudf::ast::expression predicate) override
+                        cudf::ast::operation predicate) override
   {
     return cudf::conditional_left_join_size(left, right, predicate);
   }
@@ -525,14 +524,14 @@ template <typename T>
 struct ConditionalFullJoinTest : public ConditionalJoinPairReturnTest<T> {
   std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
             std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-  join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override
+  join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override
   {
     return cudf::conditional_full_join(left, right, predicate);
   }
 
   std::size_t join_size(cudf::table_view left,
                         cudf::table_view right,
-                        cudf::ast::expression predicate) override
+                        cudf::ast::operation predicate) override
   {
     // Full joins don't actually support size calculations, but to support a
     // uniform testing framework we just calculate it from the result of doing
@@ -610,7 +609,7 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
    */
   void test(std::vector<std::vector<T>> left_data,
             std::vector<std::vector<T>> right_data,
-            cudf::ast::expression predicate,
+            cudf::ast::operation predicate,
             std::vector<cudf::size_type> expected_outputs)
   {
     auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] =
@@ -661,7 +660,7 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
    * conditional join API.
    */
   virtual std::unique_ptr<rmm::device_uvector<cudf::size_type>> join(
-    cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) = 0;
+    cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) = 0;
 
   /**
    * This method must be implemented by subclasses for specific types of joins.
@@ -670,7 +669,7 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
    */
   virtual std::size_t join_size(cudf::table_view left,
                                 cudf::table_view right,
-                                cudf::ast::expression predicate) = 0;
+                                cudf::ast::operation predicate) = 0;
 
   /**
    * This method must be implemented by subclasses for specific types of joins.
@@ -687,14 +686,14 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
 template <typename T>
 struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest<T> {
   std::unique_ptr<rmm::device_uvector<cudf::size_type>> join(
-    cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override
+    cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override
   {
     return cudf::conditional_left_semi_join(left, right, predicate);
   }
 
   std::size_t join_size(cudf::table_view left,
                         cudf::table_view right,
-                        cudf::ast::expression predicate) override
+                        cudf::ast::operation predicate) override
   {
     return cudf::conditional_left_semi_join_size(left, right, predicate);
   }
@@ -745,14 +744,14 @@ TYPED_TEST(ConditionalLeftSemiJoinTest, TestCompareRandomToHash)
 template <typename T>
 struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest<T> {
   std::unique_ptr<rmm::device_uvector<cudf::size_type>> join(
-    cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override
+    cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override
   {
     return cudf::conditional_left_anti_join(left, right, predicate);
   }
 
   std::size_t join_size(cudf::table_view left,
                         cudf::table_view right,
-                        cudf::ast::expression predicate) override
+                        cudf::ast::operation predicate) override
   {
     return cudf::conditional_left_anti_join_size(left, right, predicate);
   }
diff --git a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java b/java/src/main/java/ai/rapids/cudf/ast/AstExpression.java
similarity index 82%
rename from java/src/main/java/ai/rapids/cudf/ast/AstNode.java
rename to java/src/main/java/ai/rapids/cudf/ast/AstExpression.java
index 8160462de98..5ac15f714f0 100644
--- a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java
+++ b/java/src/main/java/ai/rapids/cudf/ast/AstExpression.java
@@ -17,14 +17,15 @@
 package ai.rapids.cudf.ast;
 
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 
 /** Base class of every node in an AST */
-public abstract class AstNode {
+public abstract class AstExpression {
   /**
    * Enumeration for the types of AST nodes that can appear in a serialized AST.
    * NOTE: This must be kept in sync with the `jni_serialized_node_type` in CompiledExpression.cpp!
    */
-  protected enum NodeType {
+  protected enum ExpressionType {
     VALID_LITERAL(0),
     NULL_LITERAL(1),
     COLUMN_REFERENCE(2),
@@ -33,7 +34,7 @@ protected enum NodeType {
 
     private final byte nativeId;
 
-    NodeType(int nativeId) {
+    ExpressionType(int nativeId) {
       this.nativeId = (byte) nativeId;
       assert this.nativeId == nativeId;
     }
@@ -49,6 +50,14 @@ void serialize(ByteBuffer bb) {
     }
   }
 
+  public CompiledExpression compile() {
+    int size = getSerializedSize();
+    ByteBuffer bb = ByteBuffer.allocate(size);
+    bb.order(ByteOrder.nativeOrder());
+    serialize(bb);
+    return new CompiledExpression(bb.array());
+  }
+
   /** Get the size in bytes of the serialized form of this node and all child nodes */
   abstract int getSerializedSize();
 
diff --git a/java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java
similarity index 72%
rename from java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java
rename to java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java
index ed4f95b01e1..c39c1c3a1c5 100644
--- a/java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java
+++ b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java
@@ -18,13 +18,13 @@
 
 import java.nio.ByteBuffer;
 
-/** A binary expression consisting of an operator and two operands. */
-public class BinaryExpression extends Expression {
+/** A binary operation consisting of an operator and two operands. */
+public class BinaryOperation extends AstExpression {
   private final BinaryOperator op;
-  private final AstNode leftInput;
-  private final AstNode rightInput;
+  private final AstExpression leftInput;
+  private final AstExpression rightInput;
 
-  public BinaryExpression(BinaryOperator op, AstNode leftInput, AstNode rightInput) {
+  public BinaryOperation(BinaryOperator op, AstExpression leftInput, AstExpression rightInput) {
     this.op = op;
     this.leftInput = leftInput;
     this.rightInput = rightInput;
@@ -32,7 +32,7 @@ public BinaryExpression(BinaryOperator op, AstNode leftInput, AstNode rightInput
 
   @Override
   int getSerializedSize() {
-    return NodeType.BINARY_EXPRESSION.getSerializedSize() +
+    return ExpressionType.BINARY_EXPRESSION.getSerializedSize() +
         op.getSerializedSize() +
         leftInput.getSerializedSize() +
         rightInput.getSerializedSize();
@@ -40,7 +40,7 @@ int getSerializedSize() {
 
   @Override
   void serialize(ByteBuffer bb) {
-    NodeType.BINARY_EXPRESSION.serialize(bb);
+    ExpressionType.BINARY_EXPRESSION.serialize(bb);
     op.serialize(bb);
     leftInput.serialize(bb);
     rightInput.serialize(bb);
diff --git a/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java
index 12e4d985658..595badb14b6 100644
--- a/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java
+++ b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java
@@ -19,7 +19,7 @@
 import java.nio.ByteBuffer;
 
 /**
- * Enumeration of AST operations that can appear in a binary expression.
+ * Enumeration of AST operators that can appear in a binary operation.
  * NOTE: This must be kept in sync with `jni_to_binary_operator` in CompiledExpression.cpp!
  */
 public enum BinaryOperator {
diff --git a/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java b/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java
index 34e4064e23b..4860a088a83 100644
--- a/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java
+++ b/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java
@@ -19,7 +19,7 @@
 import java.nio.ByteBuffer;
 
 /** A reference to a column in an input table. */
-public final class ColumnReference extends AstNode {
+public final class ColumnReference extends AstExpression {
   private final int columnIndex;
   private final TableReference tableSource;
 
@@ -37,14 +37,14 @@ public ColumnReference(int columnIndex, TableReference tableSource) {
   @Override
   int getSerializedSize() {
     // node type + table ref + column index
-    return NodeType.COLUMN_REFERENCE.getSerializedSize() +
+    return ExpressionType.COLUMN_REFERENCE.getSerializedSize() +
         tableSource.getSerializedSize() +
         Integer.BYTES;
   }
 
   @Override
   void serialize(ByteBuffer bb) {
-    NodeType.COLUMN_REFERENCE.serialize(bb);
+    ExpressionType.COLUMN_REFERENCE.serialize(bb);
     tableSource.serialize(bb);
     bb.putInt(columnIndex);
   }
diff --git a/java/src/main/java/ai/rapids/cudf/ast/Expression.java b/java/src/main/java/ai/rapids/cudf/ast/Expression.java
deleted file mode 100644
index 8d391298cef..00000000000
--- a/java/src/main/java/ai/rapids/cudf/ast/Expression.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package ai.rapids.cudf.ast;
-
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-
-/** Base class of every AST expression. */
-public abstract class Expression extends AstNode {
-  public CompiledExpression compile() {
-    int size = getSerializedSize();
-    ByteBuffer bb = ByteBuffer.allocate(size);
-    bb.order(ByteOrder.nativeOrder());
-    serialize(bb);
-    return new CompiledExpression(bb.array());
-  }
-}
diff --git a/java/src/main/java/ai/rapids/cudf/ast/Literal.java b/java/src/main/java/ai/rapids/cudf/ast/Literal.java
index be306cd99c4..b93efce8c94 100644
--- a/java/src/main/java/ai/rapids/cudf/ast/Literal.java
+++ b/java/src/main/java/ai/rapids/cudf/ast/Literal.java
@@ -22,7 +22,7 @@
 import java.nio.ByteOrder;
 
 /** A literal value in an AST expression. */
-public final class Literal extends AstNode {
+public final class Literal extends AstExpression {
   private final DType type;
   private final byte[] serializedValue;
 
@@ -207,8 +207,8 @@ public static Literal ofDurationFromLong(DType type, Long value) {
 
   @Override
   int getSerializedSize() {
-    NodeType nodeType = serializedValue != null
-        ? NodeType.VALID_LITERAL : NodeType.NULL_LITERAL;
+    ExpressionType nodeType = serializedValue != null
+        ? ExpressionType.VALID_LITERAL : ExpressionType.NULL_LITERAL;
     int size = nodeType.getSerializedSize() + getDataTypeSerializedSize();
     if (serializedValue != null) {
       size += serializedValue.length;
@@ -218,8 +218,8 @@ int getSerializedSize() {
 
   @Override
   void serialize(ByteBuffer bb) {
-    NodeType nodeType = serializedValue != null
-        ? NodeType.VALID_LITERAL : NodeType.NULL_LITERAL;
+    ExpressionType nodeType = serializedValue != null
+        ? ExpressionType.VALID_LITERAL : ExpressionType.NULL_LITERAL;
     nodeType.serialize(bb);
     serializeDataType(bb);
     if (serializedValue != null) {
diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java
similarity index 73%
rename from java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java
rename to java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java
index fa8e70266ac..03c4c45afd4 100644
--- a/java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java
+++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java
@@ -18,26 +18,26 @@
 
 import java.nio.ByteBuffer;
 
-/** A unary expression consisting of an operator and an operand. */
-public final class UnaryExpression extends Expression {
+/** A unary operation consisting of an operator and an operand. */
+public final class UnaryOperation extends AstExpression {
   private final UnaryOperator op;
-  private final AstNode input;
+  private final AstExpression input;
 
-  public UnaryExpression(UnaryOperator op, AstNode input) {
+  public UnaryOperation(UnaryOperator op, AstExpression input) {
     this.op = op;
     this.input = input;
   }
 
   @Override
   int getSerializedSize() {
-    return NodeType.UNARY_EXPRESSION.getSerializedSize() +
+    return ExpressionType.UNARY_EXPRESSION.getSerializedSize() +
         op.getSerializedSize() +
         input.getSerializedSize();
   }
 
   @Override
   void serialize(ByteBuffer bb) {
-    NodeType.UNARY_EXPRESSION.serialize(bb);
+    ExpressionType.UNARY_EXPRESSION.serialize(bb);
     op.serialize(bb);
     input.serialize(bb);
   }
diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java
index c3f193d06b4..9ef18dbd75d 100644
--- a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java
+++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java
@@ -19,7 +19,7 @@
 import java.nio.ByteBuffer;
 
 /**
- * Enumeration of AST operations that can appear in a unary expression.
+ * Enumeration of AST operators that can appear in a unary operation.
  * NOTE: This must be kept in sync with `jni_to_unary_operator` in CompiledExpression.cpp!
  */
 public enum UnaryOperator {
diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp
index fe57f79c955..470464f35c8 100644
--- a/java/src/main/native/src/CompiledExpression.cpp
+++ b/java/src/main/native/src/CompiledExpression.cpp
@@ -18,8 +18,7 @@
 #include <stdexcept>
 #include <vector>
 
-#include <cudf/ast/nodes.hpp>
-#include <cudf/ast/operators.hpp>
+#include <cudf/ast/expressions.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/transform.hpp>
@@ -104,15 +103,15 @@ class jni_serialized_ast {
 };
 
 /**
- * Enumeration of the AST node types that can appear in the serialized data.
+ * Enumeration of the AST expression types that can appear in the serialized data.
- * NOTE: This must be kept in sync with the NodeType enumeration in AstNode.java!
+ * NOTE: This must be kept in sync with the ExpressionType enumeration in AstExpression.java!
  */
-enum class jni_serialized_node_type : int8_t {
+enum class jni_serialized_expression_type : int8_t {
   VALID_LITERAL = 0,
   NULL_LITERAL = 1,
   COLUMN_REFERENCE = 2,
-  UNARY_EXPRESSION = 3,
-  BINARY_EXPRESSION = 4
+  UNARY_OPERATION = 3,
+  BINARY_OPERATION = 4
 };
 
 /**
@@ -276,41 +275,42 @@ cudf::ast::column_reference &compile_column_reference(cudf::jni::ast::compiled_e
 }
 
 // forward declaration
-cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_expr,
-                                      jni_serialized_ast &jni_ast);
+cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr,
+                                          jni_serialized_ast &jni_ast);
 
 /** Decode a serialized AST unary expression */
-cudf::ast::expression &compile_unary_expression(cudf::jni::ast::compiled_expr &compiled_expr,
-                                                jni_serialized_ast &jni_ast) {
+cudf::ast::operation &compile_unary_expression(cudf::jni::ast::compiled_expr &compiled_expr,
+                                               jni_serialized_ast &jni_ast) {
   auto const ast_op = jni_to_unary_operator(jni_ast.read_byte());
-  cudf::ast::detail::node &child_node = compile_node(compiled_expr, jni_ast);
-  return compiled_expr.add_expression(std::make_unique<cudf::ast::expression>(ast_op, child_node));
+  cudf::ast::expression &child_expression = compile_expression(compiled_expr, jni_ast);
+  return compiled_expr.add_operation(
+      std::make_unique<cudf::ast::operation>(ast_op, child_expression));
 }
 
 /** Decode a serialized AST binary expression */
-cudf::ast::expression &compile_binary_expression(cudf::jni::ast::compiled_expr &compiled_expr,
-                                                 jni_serialized_ast &jni_ast) {
+cudf::ast::operation &compile_binary_expression(cudf::jni::ast::compiled_expr &compiled_expr,
+                                                jni_serialized_ast &jni_ast) {
   auto const ast_op = jni_to_binary_operator(jni_ast.read_byte());
-  cudf::ast::detail::node &left_child = compile_node(compiled_expr, jni_ast);
-  cudf::ast::detail::node &right_child = compile_node(compiled_expr, jni_ast);
-  return compiled_expr.add_expression(
-      std::make_unique<cudf::ast::expression>(ast_op, left_child, right_child));
+  cudf::ast::expression &left_child = compile_expression(compiled_expr, jni_ast);
+  cudf::ast::expression &right_child = compile_expression(compiled_expr, jni_ast);
+  return compiled_expr.add_operation(
+      std::make_unique<cudf::ast::operation>(ast_op, left_child, right_child));
 }
 
-/** Decode a serialized AST node by reading the node type and dispatching */
-cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_expr,
-                                      jni_serialized_ast &jni_ast) {
-  auto const node_type = static_cast<jni_serialized_node_type>(jni_ast.read_byte());
-  switch (node_type) {
-    case jni_serialized_node_type::VALID_LITERAL:
+/** Decode a serialized AST expression by reading the expression type and dispatching */
+cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr,
+                                          jni_serialized_ast &jni_ast) {
+  auto const expression_type = static_cast<jni_serialized_expression_type>(jni_ast.read_byte());
+  switch (expression_type) {
+    case jni_serialized_expression_type::VALID_LITERAL:
       return compile_literal(true, compiled_expr, jni_ast);
-    case jni_serialized_node_type::NULL_LITERAL:
+    case jni_serialized_expression_type::NULL_LITERAL:
       return compile_literal(false, compiled_expr, jni_ast);
-    case jni_serialized_node_type::COLUMN_REFERENCE:
+    case jni_serialized_expression_type::COLUMN_REFERENCE:
       return compile_column_reference(compiled_expr, jni_ast);
-    case jni_serialized_node_type::UNARY_EXPRESSION:
+    case jni_serialized_expression_type::UNARY_OPERATION:
       return compile_unary_expression(compiled_expr, jni_ast);
-    case jni_serialized_node_type::BINARY_EXPRESSION:
+    case jni_serialized_expression_type::BINARY_OPERATION:
       return compile_binary_expression(compiled_expr, jni_ast);
     default: throw std::invalid_argument("data is not a serialized AST expression");
   }
@@ -319,16 +319,7 @@ cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_ex
 /** Decode a serialized AST into a native libcudf AST and associated resources */
 std::unique_ptr<cudf::jni::ast::compiled_expr> compile_serialized_ast(jni_serialized_ast &jni_ast) {
   auto jni_expr_ptr = std::make_unique<cudf::jni::ast::compiled_expr>();
-  auto const node_type = static_cast<jni_serialized_node_type>(jni_ast.read_byte());
-  switch (node_type) {
-    case jni_serialized_node_type::UNARY_EXPRESSION:
-      (void)compile_unary_expression(*jni_expr_ptr, jni_ast);
-      break;
-    case jni_serialized_node_type::BINARY_EXPRESSION:
-      (void)compile_binary_expression(*jni_expr_ptr, jni_ast);
-      break;
-    default: throw std::invalid_argument("data is not a serialized AST expression");
-  }
+  (void)compile_expression(*jni_expr_ptr, jni_ast);
 
   if (!jni_ast.at_eof()) {
     throw std::invalid_argument("Extra bytes at end of serialized AST");
diff --git a/java/src/main/native/src/jni_compiled_expr.hpp b/java/src/main/native/src/jni_compiled_expr.hpp
index e42e5a37fba..74010f71011 100644
--- a/java/src/main/native/src/jni_compiled_expr.hpp
+++ b/java/src/main/native/src/jni_compiled_expr.hpp
@@ -32,12 +32,6 @@ namespace ast {
  * base AST node type. Then we do not have to track every AST node type separately.
  */
 class compiled_expr {
-  /** All literal nodes within the expression tree */
-  std::vector<std::unique_ptr<cudf::ast::literal>> literals;
-
-  /** All column reference nodes within the expression tree */
-  std::vector<std::unique_ptr<cudf::ast::column_reference>> column_refs;
-
   /** All expression nodes within the expression tree */
   std::vector<std::unique_ptr<cudf::ast::expression>> expressions;
 
@@ -47,20 +41,20 @@ class compiled_expr {
 public:
   cudf::ast::literal &add_literal(std::unique_ptr<cudf::ast::literal> literal_ptr,
                                   std::unique_ptr<cudf::scalar> scalar_ptr) {
-    literals.push_back(std::move(literal_ptr));
+    expressions.push_back(std::move(literal_ptr));
     scalars.push_back(std::move(scalar_ptr));
-    return *literals.back();
+    return static_cast<cudf::ast::literal &>(*expressions.back());
   }
 
   cudf::ast::column_reference &
   add_column_ref(std::unique_ptr<cudf::ast::column_reference> ref_ptr) {
-    column_refs.push_back(std::move(ref_ptr));
-    return *column_refs.back();
+    expressions.push_back(std::move(ref_ptr));
+    return static_cast<cudf::ast::column_reference &>(*expressions.back());
   }
 
-  cudf::ast::expression &add_expression(std::unique_ptr<cudf::ast::expression> expr_ptr) {
+  cudf::ast::operation &add_operation(std::unique_ptr<cudf::ast::operation> expr_ptr) {
     expressions.push_back(std::move(expr_ptr));
-    return *expressions.back();
+    return static_cast<cudf::ast::operation &>(*expressions.back());
   }
 
   /** Return the expression node at the top of the tree */
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 725abd9486d..8e4e3df612b 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -25,7 +25,7 @@
 import ai.rapids.cudf.HostColumnVector.StructData;
 import ai.rapids.cudf.HostColumnVector.StructType;
 
-import ai.rapids.cudf.ast.BinaryExpression;
+import ai.rapids.cudf.ast.BinaryOperation;
 import ai.rapids.cudf.ast.BinaryOperator;
 import ai.rapids.cudf.ast.ColumnReference;
 import ai.rapids.cudf.ast.CompiledExpression;
@@ -1503,7 +1503,7 @@ void testLeftJoinGatherMapsNulls() {
   @Test
   void testConditionalLeftJoinGatherMaps() {
     final int inv = Integer.MIN_VALUE;
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -1529,7 +1529,7 @@ void testConditionalLeftJoinGatherMaps() {
   @Test
   void testConditionalLeftJoinGatherMapsNulls() {
     final int inv = Integer.MIN_VALUE;
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -1557,7 +1557,7 @@ void testConditionalLeftJoinGatherMapsNulls() {
   @Test
   void testConditionalLeftJoinGatherMapsWithCount() {
     final int inv = Integer.MIN_VALUE;
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -1585,7 +1585,7 @@ void testConditionalLeftJoinGatherMapsWithCount() {
   @Test
   void testConditionalLeftJoinGatherMapsNullsWithCount() {
     final int inv = Integer.MIN_VALUE;
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -1656,7 +1656,7 @@ void testInnerJoinGatherMapsNulls() {
 
   @Test
   void testConditionalInnerJoinGatherMaps() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -1681,7 +1681,7 @@ void testConditionalInnerJoinGatherMaps() {
 
   @Test
   void testConditionalInnerJoinGatherMapsNulls() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -1708,7 +1708,7 @@ void testConditionalInnerJoinGatherMapsNulls() {
 
   @Test
   void testConditionalInnerJoinGatherMapsWithCount() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -1735,7 +1735,7 @@ void testConditionalInnerJoinGatherMapsWithCount() {
 
   @Test
   void testConditionalInnerJoinGatherMapsNullsWithCount() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -1809,7 +1809,7 @@ void testFullJoinGatherMapsNulls() {
   @Test
   void testConditionalFullJoinGatherMaps() {
     final int inv = Integer.MIN_VALUE;
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -1835,7 +1835,7 @@ void testConditionalFullJoinGatherMaps() {
   @Test
   void testConditionalFullJoinGatherMapsNulls() {
     final int inv = Integer.MIN_VALUE;
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -1890,7 +1890,7 @@ void testLeftSemiJoinGatherMapNulls() {
 
   @Test
   void testConditionalLeftSemiJoinGatherMap() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -1908,7 +1908,7 @@ void testConditionalLeftSemiJoinGatherMap() {
 
   @Test
   void testConditionalLeftSemiJoinGatherMapNulls() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -1928,7 +1928,7 @@ void testConditionalLeftSemiJoinGatherMapNulls() {
 
   @Test
   void testConditionalLeftSemiJoinGatherMapWithCount() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -1950,7 +1950,7 @@ void testConditionalLeftSemiJoinGatherMapWithCount() {
 
   @Test
   void testConditionalLeftSemiJoinGatherMapNullsWithCount() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -2002,7 +2002,7 @@ void testAntiSemiJoinGatherMapNulls() {
 
   @Test
   void testConditionalLeftAntiJoinGatherMap() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -2020,7 +2020,7 @@ void testConditionalLeftAntiJoinGatherMap() {
 
   @Test
   void testConditionalAntiSemiJoinGatherMapNulls() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
@@ -2040,7 +2040,7 @@ void testConditionalAntiSemiJoinGatherMapNulls() {
 
   @Test
   void testConditionalLeftAntiJoinGatherMapWithCount() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
@@ -2062,7 +2062,7 @@ void testConditionalLeftAntiJoinGatherMapWithCount() {
 
   @Test
   void testConditionalAntiSemiJoinGatherMapNullsWithCount() {
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL,
         new ColumnReference(0, TableReference.LEFT),
         new ColumnReference(0, TableReference.RIGHT));
     try (Table left = new Table.TestBuilder()
diff --git a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java
index 177abe9d6e3..13af9aff682 100644
--- a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java
@@ -42,16 +42,14 @@ public class CompiledExpressionTest extends CudfTestBase {
   public void testColumnReferenceTransform() {
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build()) {
       // use an implicit table reference
-      UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY,
-          new ColumnReference(1));
+      ColumnReference expr = new ColumnReference(1);
       try (CompiledExpression compiledExpr = expr.compile();
            ColumnVector actual = compiledExpr.computeColumn(t)) {
         assertColumnsAreEqual(t.getColumn(1), actual);
       }
 
       // use an explicit table reference
-      expr = new UnaryExpression(UnaryOperator.IDENTITY,
-          new ColumnReference(1, TableReference.LEFT));
+      expr = new ColumnReference(1, TableReference.LEFT);
       try (CompiledExpression compiledExpr = expr.compile();
            ColumnVector actual = compiledExpr.computeColumn(t)) {
         assertColumnsAreEqual(t.getColumn(1), actual);
@@ -62,8 +60,7 @@ public void testColumnReferenceTransform() {
   @Test
   public void testInvalidColumnReferenceTransform() {
     // Verify that computeColumn throws when passed an expression operating on TableReference.RIGHT.
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY,
-        new ColumnReference(1, TableReference.RIGHT));
+    ColumnReference expr = new ColumnReference(1, TableReference.RIGHT);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile()) {
       Assertions.assertThrows(CudfException.class, () -> compiledExpr.computeColumn(t).close());
@@ -73,9 +70,8 @@ public void testInvalidColumnReferenceTransform() {
   @Test
   public void testBooleanLiteralTransform() {
     try (Table t = new Table.TestBuilder().column(true, false, null).build()) {
-      Literal trueLiteral = Literal.ofBoolean(true);
-      UnaryExpression trueExpr = new UnaryExpression(UnaryOperator.IDENTITY, trueLiteral);
-      try (CompiledExpression trueCompiledExpr = trueExpr.compile();
+      Literal expr = Literal.ofBoolean(true);
+      try (CompiledExpression trueCompiledExpr = expr.compile();
            ColumnVector trueExprActual = trueCompiledExpr.computeColumn(t);
            ColumnVector trueExprExpected = ColumnVector.fromBoxedBooleans(true, true, true)) {
         assertColumnsAreEqual(trueExprExpected, trueExprActual);
@@ -83,7 +79,7 @@ public void testBooleanLiteralTransform() {
 
       // Uncomment the following after https://github.com/rapidsai/cudf/issues/8831 is fixed
       // Literal nullLiteral = Literal.ofBoolean(null);
-      // UnaryExpression nullExpr = new UnaryExpression(AstOperator.IDENTITY, nullLiteral);
+      // UnaryOperation nullExpr = new UnaryOperation(UnaryOperator.IDENTITY, nullLiteral);
       // try (CompiledExpression nullCompiledExpr = nullExpr.compile();
       //      ColumnVector nullExprActual = nullCompiledExpr.computeColumn(t);
       //      ColumnVector nullExprExpected = ColumnVector.fromBoxedBooleans(null, null, null)) {
@@ -97,8 +93,7 @@ public void testBooleanLiteralTransform() {
   // @NullSource
   @ValueSource(bytes = 0x12)
   public void testByteLiteralTransform(Byte value) {
-    Literal literal = Literal.ofByte(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofByte(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -112,8 +107,7 @@ public void testByteLiteralTransform(Byte value) {
   // @NullSource
   @ValueSource(shorts = 0x1234)
   public void testShortLiteralTransform(Short value) {
-    Literal literal = Literal.ofShort(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofShort(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -127,8 +121,7 @@ public void testShortLiteralTransform(Short value) {
   // @NullSource
   @ValueSource(ints = 0x12345678)
   public void testIntLiteralTransform(Integer value) {
-    Literal literal = Literal.ofInt(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofInt(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -142,8 +135,7 @@ public void testIntLiteralTransform(Integer value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testLongLiteralTransform(Long value) {
-    Literal literal = Literal.ofLong(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofLong(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -157,8 +149,7 @@ public void testLongLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(floats = { 123456.789f, Float.NaN, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY} )
   public void testFloatLiteralTransform(Float value) {
-    Literal literal = Literal.ofFloat(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofFloat(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -172,8 +163,7 @@ public void testFloatLiteralTransform(Float value) {
   // @NullSource
   @ValueSource(doubles = { 123456.789f, Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY} )
   public void testDoubleLiteralTransform(Double value) {
-    Literal literal = Literal.ofDouble(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofDouble(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -187,8 +177,7 @@ public void testDoubleLiteralTransform(Double value) {
   // @NullSource
   @ValueSource(ints = 0x12345678)
   public void testTimestampDaysLiteralTransform(Integer value) {
-    Literal literal = Literal.ofTimestampDaysFromInt(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofTimestampDaysFromInt(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -203,8 +192,7 @@ public void testTimestampDaysLiteralTransform(Integer value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testTimestampSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_SECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_SECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -219,8 +207,7 @@ public void testTimestampSecondsLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testTimestampMilliSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_MILLISECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_MILLISECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -235,8 +222,7 @@ public void testTimestampMilliSecondsLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testTimestampMicroSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_MICROSECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_MICROSECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -251,8 +237,7 @@ public void testTimestampMicroSecondsLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testTimestampNanoSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_NANOSECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_NANOSECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -267,8 +252,7 @@ public void testTimestampNanoSecondsLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(ints = 0x12345678)
   public void testDurationDaysLiteralTransform(Integer value) {
-    Literal literal = Literal.ofDurationDaysFromInt(value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofDurationDaysFromInt(value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -283,8 +267,7 @@ public void testDurationDaysLiteralTransform(Integer value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testDurationSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofDurationFromLong(DType.DURATION_SECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofDurationFromLong(DType.DURATION_SECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -299,8 +282,7 @@ public void testDurationSecondsLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testDurationMilliSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofDurationFromLong(DType.DURATION_MILLISECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofDurationFromLong(DType.DURATION_MILLISECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -315,8 +297,7 @@ public void testDurationMilliSecondsLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testDurationMicroSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofDurationFromLong(DType.DURATION_MICROSECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofDurationFromLong(DType.DURATION_MICROSECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -331,8 +312,7 @@ public void testDurationMicroSecondsLiteralTransform(Long value) {
   // @NullSource
   @ValueSource(longs = 0x1234567890abcdefL)
   public void testDurationNanoSecondsLiteralTransform(Long value) {
-    Literal literal = Literal.ofDurationFromLong(DType.DURATION_NANOSECONDS, value);
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal);
+    Literal expr = Literal.ofDurationFromLong(DType.DURATION_NANOSECONDS, value);
     try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -359,7 +339,7 @@ private static <T, U, R> ArrayList<R> mapArray(T[] in1, U[] in2, BiFunction<T, U
     return result;
   }
 
-  private static Stream<Arguments> createUnaryDoubleExpressionParams() {
+  private static Stream<Arguments> createUnaryDoubleOperationParams() {
     Double[] input = new Double[] { -5., 4.5, null, 2.7, 1.5 };
     return Stream.of(
         Arguments.of(UnaryOperator.IDENTITY, input, Arrays.asList(input)),
@@ -383,10 +363,10 @@ private static Stream<Arguments> createUnaryDoubleExpressionParams() {
   }
 
   @ParameterizedTest
-  @MethodSource("createUnaryDoubleExpressionParams")
-  void testUnaryDoubleExpressionTransform(UnaryOperator op, Double[] input,
+  @MethodSource("createUnaryDoubleOperationParams")
+  void testUnaryDoubleOperationTransform(UnaryOperator op, Double[] input,
                                           List<Double> expectedValues) {
-    UnaryExpression expr = new UnaryExpression(op, new ColumnReference(0));
+    UnaryOperation expr = new UnaryOperation(op, new ColumnReference(0));
     try (Table t = new Table.TestBuilder().column(input).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -397,17 +377,17 @@ void testUnaryDoubleExpressionTransform(UnaryOperator op, Double[] input,
   }
 
   @Test
-  void testUnaryShortExpressionTransform() {
+  void testUnaryShortOperationTransform() {
     Short[] input = new Short[] { -5, 4, null, 2, 1 };
     try (Table t = new Table.TestBuilder().column(input).build()) {
-      UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, new ColumnReference(0));
+      ColumnReference expr = new ColumnReference(0);
       try (CompiledExpression compiledExpr = expr.compile();
            ColumnVector actual = compiledExpr.computeColumn(t)) {
         assertColumnsAreEqual(t.getColumn(0), actual);
       }
 
-      expr = new UnaryExpression(UnaryOperator.BIT_INVERT, new ColumnReference(0));
-      try (CompiledExpression compiledExpr = expr.compile();
+      UnaryOperation expr2 = new UnaryOperation(UnaryOperator.BIT_INVERT, new ColumnReference(0));
+      try (CompiledExpression compiledExpr = expr2.compile();
            ColumnVector actual = compiledExpr.computeColumn(t);
            ColumnVector expected = ColumnVector.fromBoxedInts(4, -5, null, -3, -2)) {
         assertColumnsAreEqual(expected, actual);
@@ -416,8 +396,8 @@ void testUnaryShortExpressionTransform() {
   }
 
   @Test
-  void testUnaryLogicalExpressionTransform() {
-    UnaryExpression expr = new UnaryExpression(UnaryOperator.NOT, new ColumnReference(0));
+  void testUnaryLogicalOperationTransform() {
+    UnaryOperation expr = new UnaryOperation(UnaryOperator.NOT, new ColumnReference(0));
     try (Table t = new Table.TestBuilder().column(-5L, 0L, null, 2L, 1L).build();
          CompiledExpression compiledExpr = expr.compile();
          ColumnVector actual = compiledExpr.computeColumn(t);
@@ -426,7 +406,7 @@ void testUnaryLogicalExpressionTransform() {
     }
   }
 
-  private static Stream<Arguments> createBinaryFloatExpressionParams() {
+  private static Stream<Arguments> createBinaryFloatOperationParams() {
     Float[] in1 = new Float[] { -5f, 4.5f, null, 2.7f };
     Float[] in2 = new Float[] { 123f, -456f, null, 0f };
     return Stream.of(
@@ -442,10 +422,10 @@ private static Stream<Arguments> createBinaryFloatExpressionParams() {
   }
 
   @ParameterizedTest
-  @MethodSource("createBinaryFloatExpressionParams")
-  void testBinaryFloatExpressionTransform(BinaryOperator op, Float[] in1, Float[] in2,
+  @MethodSource("createBinaryFloatOperationParams")
+  void testBinaryFloatOperationTransform(BinaryOperator op, Float[] in1, Float[] in2,
                                           List<Float> expectedValues) {
-    BinaryExpression expr = new BinaryExpression(op,
+    BinaryOperation expr = new BinaryOperation(op,
         new ColumnReference(0),
         new ColumnReference(1));
     try (Table t = new Table.TestBuilder().column(in1).column(in2).build();
@@ -457,7 +437,7 @@ void testBinaryFloatExpressionTransform(BinaryOperator op, Float[] in1, Float[]
     }
   }
 
-  private static Stream<Arguments> createBinaryDoublePromotedExpressionParams() {
+  private static Stream<Arguments> createBinaryDoublePromotedOperationParams() {
     Float[] in1 = new Float[] { -5f, 4.5f, null, 2.7f };
     Float[] in2 = new Float[] { 123f, -456f, null, 0f };
     return Stream.of(
@@ -468,10 +448,10 @@ private static Stream<Arguments> createBinaryDoublePromotedExpressionParams() {
   }
 
   @ParameterizedTest
-  @MethodSource("createBinaryDoublePromotedExpressionParams")
-  void testBinaryDoublePromotedExpressionTransform(BinaryOperator op, Float[] in1, Float[] in2,
+  @MethodSource("createBinaryDoublePromotedOperationParams")
+  void testBinaryDoublePromotedOperationTransform(BinaryOperator op, Float[] in1, Float[] in2,
                                                    List<Double> expectedValues) {
-    BinaryExpression expr = new BinaryExpression(op,
+    BinaryOperation expr = new BinaryOperation(op,
         new ColumnReference(0),
         new ColumnReference(1));
     try (Table t = new Table.TestBuilder().column(in1).column(in2).build();
@@ -483,7 +463,7 @@ void testBinaryDoublePromotedExpressionTransform(BinaryOperator op, Float[] in1,
     }
   }
 
-  private static Stream<Arguments> createBinaryComparisonExpressionParams() {
+  private static Stream<Arguments> createBinaryComparisonOperationParams() {
     Integer[] in1 = new Integer[] { -5, 4, null, 2, -3 };
     Integer[] in2 = new Integer[] { 123, -456, null, 0, -3 };
     return Stream.of(
@@ -497,10 +477,10 @@ private static Stream<Arguments> createBinaryComparisonExpressionParams() {
   }
 
   @ParameterizedTest
-  @MethodSource("createBinaryComparisonExpressionParams")
-  void testBinaryComparisonExpressionTransform(BinaryOperator op, Integer[] in1, Integer[] in2,
+  @MethodSource("createBinaryComparisonOperationParams")
+  void testBinaryComparisonOperationTransform(BinaryOperator op, Integer[] in1, Integer[] in2,
                                                List<Boolean> expectedValues) {
-    BinaryExpression expr = new BinaryExpression(op,
+    BinaryOperation expr = new BinaryOperation(op,
         new ColumnReference(0),
         new ColumnReference(1));
     try (Table t = new Table.TestBuilder().column(in1).column(in2).build();
@@ -512,7 +492,7 @@ void testBinaryComparisonExpressionTransform(BinaryOperator op, Integer[] in1, I
     }
   }
 
-  private static Stream<Arguments> createBinaryBitwiseExpressionParams() {
+  private static Stream<Arguments> createBinaryBitwiseOperationParams() {
     Integer[] in1 = new Integer[] { -5, 4, null, 2, -3 };
     Integer[] in2 = new Integer[] { 123, -456, null, 0, -3 };
     return Stream.of(
@@ -522,10 +502,10 @@ private static Stream<Arguments> createBinaryBitwiseExpressionParams() {
   }
 
   @ParameterizedTest
-  @MethodSource("createBinaryBitwiseExpressionParams")
-  void testBinaryBitwiseExpressionTransform(BinaryOperator op, Integer[] in1, Integer[] in2,
+  @MethodSource("createBinaryBitwiseOperationParams")
+  void testBinaryBitwiseOperationTransform(BinaryOperator op, Integer[] in1, Integer[] in2,
                                             List<Integer> expectedValues) {
-    BinaryExpression expr = new BinaryExpression(op,
+    BinaryOperation expr = new BinaryOperation(op,
         new ColumnReference(0),
         new ColumnReference(1));
     try (Table t = new Table.TestBuilder().column(in1).column(in2).build();
@@ -537,7 +517,7 @@ void testBinaryBitwiseExpressionTransform(BinaryOperator op, Integer[] in1, Inte
     }
   }
 
-  private static Stream<Arguments> createBinaryBooleanExpressionParams() {
+  private static Stream<Arguments> createBinaryBooleanOperationParams() {
     Boolean[] in1 = new Boolean[] { false, true, null, true, false };
     Boolean[] in2 = new Boolean[] { true, null, null, true, false };
     return Stream.of(
@@ -546,10 +526,10 @@ private static Stream<Arguments> createBinaryBooleanExpressionParams() {
   }
 
   @ParameterizedTest
-  @MethodSource("createBinaryBooleanExpressionParams")
-  void testBinaryBooleanExpressionTransform(BinaryOperator op, Boolean[] in1, Boolean[] in2,
+  @MethodSource("createBinaryBooleanOperationParams")
+  void testBinaryBooleanOperationTransform(BinaryOperator op, Boolean[] in1, Boolean[] in2,
                                             List<Boolean> expectedValues) {
-    BinaryExpression expr = new BinaryExpression(op,
+    BinaryOperation expr = new BinaryOperation(op,
         new ColumnReference(0),
         new ColumnReference(1));
     try (Table t = new Table.TestBuilder().column(in1).column(in2).build();
@@ -562,9 +542,9 @@ void testBinaryBooleanExpressionTransform(BinaryOperator op, Boolean[] in1, Bool
   }
 
   @Test
-  void testMismatchedBinaryExpressionTypes() {
+  void testMismatchedBinaryOperationTypes() {
     // verify expression fails to transform if operands are not the same type
-    BinaryExpression expr = new BinaryExpression(BinaryOperator.ADD,
+    BinaryOperation expr = new BinaryOperation(BinaryOperator.ADD,
         new ColumnReference(0),
         new ColumnReference(1));
     try (Table t = new Table.TestBuilder().column(1, 2, 3).column(1L, 2L, 3L).build();

From 417b34df0f91593c90d688def32f1bddf4b4b52c Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Wed, 18 Aug 2021 22:05:00 -0400
Subject: [PATCH 09/46] Various multiindex related fixes (#9036)

This PR fixes two different `MultiIndex` issues:
1. Creation of a `MultiIndex` when the input is not a `cudf.DataFrame` and `names` has duplicate values.
2. Correct assignment and preservation of names in indexing and slicing operations.

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9036
---
 python/cudf/cudf/core/dataframe.py        | 11 ++++--
 python/cudf/cudf/core/multiindex.py       | 46 ++++++++++-------------
 python/cudf/cudf/core/series.py           |  2 +-
 python/cudf/cudf/tests/test_multiindex.py | 28 ++++++++++++++
 4 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a78c24c21b9..f66cb570fbb 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -860,10 +860,13 @@ def _slice(self: T, arg: slice) -> T:
                 )
 
                 result._copy_type_metadata(self, include_index=keep_index)
-                # Adding index of type RangeIndex back to
-                # result
-                if keep_index is False and self.index is not None:
-                    result.index = self.index[start:stop]
+                if self.index is not None:
+                    if keep_index:
+                        result._index.names = self.index.names
+                    else:
+                        # Adding index of type RangeIndex back to
+                        # result
+                        result.index = self.index[start:stop]
                 result.columns = self.columns
                 return result
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index af6ac5f3dae..079a6d902b6 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -93,7 +93,6 @@ def __init__(
 
         self._name = None
 
-        column_names = []
         if labels:
             warnings.warn(
                 "the 'labels' keyword is deprecated, use 'codes' " "instead",
@@ -123,17 +122,6 @@ def __init__(
             self._levels = levels
             return
 
-        # name setup
-        if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),):
-            if sum(x is None for x in names) > 1:
-                column_names = list(range(len(codes)))
-            else:
-                column_names = names
-        elif names is None:
-            column_names = list(range(len(codes)))
-        else:
-            column_names = names
-
         if len(levels) == 0:
             raise ValueError("Must pass non-zero number of levels/codes")
 
@@ -146,10 +134,12 @@ def __init__(
             self._codes = codes
         elif len(levels) == len(codes):
             self._codes = cudf.DataFrame()
-            for i, codes in enumerate(codes):
-                name = column_names[i] or i
-                codes = column.as_column(codes)
-                self._codes[name] = codes.astype(np.int64)
+            self._codes = cudf.DataFrame._from_data(
+                {
+                    i: column.as_column(code).astype(np.int64)
+                    for i, code in enumerate(codes)
+                }
+            )
         else:
             raise ValueError(
                 "MultiIndex has unequal number of levels and "
@@ -160,20 +150,20 @@ def __init__(
         self._validate_levels_and_codes(self._levels, self._codes)
 
         source_data = cudf.DataFrame()
-        for i, name in enumerate(self._codes.columns):
-            codes = as_index(self._codes[name]._column)
-            if -1 in self._codes[name].values:
+        for i, n in enumerate(self._codes.columns):
+            codes = as_index(self._codes[n]._column)
+            if -1 in self._codes[n].values:
                 # Must account for null(s) in _source_data column
                 level = cudf.DataFrame(
-                    {name: [None] + list(self._levels[i])},
+                    {n: [None] + list(self._levels[i])},
                     index=range(-1, len(self._levels[i])),
                 )
             else:
-                level = cudf.DataFrame({name: self._levels[i]})
+                level = cudf.DataFrame({n: self._levels[i]})
 
-            source_data[name] = libcudf.copying.gather(
+            source_data[n] = libcudf.copying.gather(
                 level, codes._data.columns[0]
-            )[0][name]
+            )[0][n]
 
         self._data = source_data._data
         self.names = names
@@ -1106,10 +1096,12 @@ def __getitem__(self, index):
         match = self.take(index)
         if isinstance(index, slice):
             return match
-        result = []
-        for level, item in enumerate(match.codes):
-            result.append(match.levels[level][match.codes[item].iloc[0]])
-        return tuple(result)
+        if isinstance(index, int):
+            # we are indexing into a single row of the MultiIndex,
+            # return that row as a tuple:
+            return match.to_pandas()[0]
+        else:
+            return match
 
     def to_frame(self, index=True, name=None):
         df = self._source_data
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 1249e126ee9..380e1838534 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -6384,7 +6384,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False):
     for sr in series_list[1:]:
         if not sr.index.names == head.names:
             all_names_equal = False
-    new_index_names = [None]
+    new_index_names = [None] * head.nlevels
     if all_names_equal:
         new_index_names = head.names
 
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index c8e5a9f071b..18a82b58670 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -1522,3 +1522,31 @@ def test_multiindex_rename_error(names):
         lfunc_args_and_kwargs=([], {"names": names}),
         rfunc_args_and_kwargs=([], {"names": names}),
     )
+
+
+@pytest.mark.parametrize(
+    "key",
+    [0, 1, [], [0, 1], slice(None), slice(0, 0), slice(0, 1), slice(0, 2)],
+)
+def test_multiindex_indexing(key):
+    gi = cudf.MultiIndex.from_frame(
+        cudf.DataFrame({"a": [1, 2, 3], "b": [True, False, False]})
+    )
+    pi = gi.to_pandas()
+
+    assert_eq(gi[key], pi[key], exact=False)
+
+
+def test_multiIndex_duplicate_names():
+    gi = cudf.MultiIndex(
+        levels=[["a", "b"], ["b", "a"]],
+        codes=[[0, 0], [0, 1]],
+        names=["a", "a"],
+    )
+    pi = pd.MultiIndex(
+        levels=[["a", "b"], ["b", "a"]],
+        codes=[[0, 0], [0, 1]],
+        names=["a", "a"],
+    )
+
+    assert_eq(gi, pi)

From f95b43e44a21a473f7b4f1b626b2aab66a8db88a Mon Sep 17 00:00:00 2001
From: Devavret Makkar <devavret@users.noreply.github.com>
Date: Thu, 19 Aug 2021 12:13:58 +0530
Subject: [PATCH 10/46] Parquet writer dictionary encoding refactor (#8476)

Replaces the previous Parquet dictionary encoding code with an implementation that uses the static map from `cuCollections`.

Adds [`cuCollections`](https://github.com/NVIDIA/cuCollections) to `libcudf`

Closes #7873
Fixes #8890

**Currently blocked on Pascal support for static_map in cuCollections**

(More details to be added)

<!--

Thank you for contributing to cuDF :)

Here are some guidelines to help the review process go smoothly.

1. Please write a description in this text box of the changes that are being
   made.

2. Please ensure that you have written unit tests for the changes made/features
   added.

3. There are CI checks in place to enforce that committed code follows our style
   and syntax standards. Please see our contribution guide in `CONTRIBUTING.MD`
   in the project root for more information about the checks we perform and how
   you can run them locally.

4. If you are closing an issue please use one of the automatic closing words as
   noted here: https://help.github.com/articles/closing-issues-using-keywords/

5. If your pull request is not ready for review but you want to make use of the
   continuous integration testing facilities please mark your pull request as Draft.
   https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request#converting-a-pull-request-to-a-draft

6. If your pull request is ready to be reviewed without requiring additional
   work on top of it, then remove it from "Draft" and make it "Ready for Review".
   https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request#marking-a-pull-request-as-ready-for-review

   If assistance is required to complete the functionality, for example when the
   C/C++ code of a feature is complete but Python bindings are still required,
   then add the label `help wanted` so that others can triage and assist.
   The additional changes then can be implemented on top of the same PR.
   If the assistance is done by members of the rapidsAI team, then no
   additional actions are required by the creator of the original PR for this,
   otherwise the original author of the PR needs to give permission to the
   person(s) assisting to commit to their personal fork of the project. If that
   doesn't happen then a new PR based on the code of the original PR can be
   opened by the person assisting, which then will be the PR that will be
   merged.

7. Once all work has been done and review has taken place please do not add
   features or make changes out of the scope of those requested by the reviewer
   (doing this just adds delays, as already-reviewed code ends up having to be
   re-reviewed and it becomes hard to tell what is new, etc.). Further, please do not
   rebase your branch on the target branch, force push, or rewrite history.
   Doing any of these causes the context of any comments made by reviewers to be lost.
   If conflicts occur against the target branch they should be resolved by
   merging the target branch into the branch used for making the pull request.

Many thanks in advance for your cooperation!

-->

Authors:
  - Devavret Makkar (https://github.com/devavret)
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/8476
---
 cpp/CMakeLists.txt                            |   8 +-
 .../thirdparty/CUDF_GetcuCollections.cmake    |  38 ++
 cpp/src/io/parquet/chunk_dict.cu              | 371 ++++++++++++++++++
 cpp/src/io/parquet/page_dict.cu               | 335 ----------------
 cpp/src/io/parquet/page_enc.cu                | 333 ++++------------
 cpp/src/io/parquet/parquet_gpu.hpp            | 117 ++++--
 cpp/src/io/parquet/writer_impl.cu             | 232 ++++++-----
 cpp/src/io/parquet/writer_impl.hpp            |   9 +-
 cpp/tests/io/parquet_test.cpp                 |   4 +-
 9 files changed, 708 insertions(+), 739 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake
 create mode 100644 cpp/src/io/parquet/chunk_dict.cu
 delete mode 100644 cpp/src/io/parquet/page_dict.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 81fad82b3ea..6a972891958 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -138,6 +138,9 @@ include(cmake/thirdparty/CUDF_GetArrow.cmake)
 include(cmake/thirdparty/CUDF_GetDLPack.cmake)
 # find libcu++
 include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake)
+# find cuCollections
+# Should come after including thrust and libcudacxx
+include(cmake/thirdparty/CUDF_GetcuCollections.cmake)
 # find or install GoogleTest
 include(cmake/thirdparty/CUDF_GetGTest.cmake)
 # preprocess jitify-able kernels
@@ -285,7 +288,7 @@ add_library(cudf
     src/io/orc/writer_impl.cu
     src/io/parquet/compact_protocol_writer.cpp
     src/io/parquet/page_data.cu
-    src/io/parquet/page_dict.cu
+    src/io/parquet/chunk_dict.cu
     src/io/parquet/page_enc.cu
     src/io/parquet/page_hdr.cu
     src/io/parquet/parquet.cpp
@@ -527,7 +530,8 @@ target_link_libraries(cudf
            PUBLIC ZLIB::ZLIB
                   ${ARROW_LIBRARIES}
                   cudf::Thrust
-                  rmm::rmm)
+                  rmm::rmm
+           PRIVATE cuco::cuco)
 
 if(CUDA_STATIC_RUNTIME)
     # Tell CMake what CUDA language runtime to use
diff --git a/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake
new file mode 100644
index 00000000000..73717249585
--- /dev/null
+++ b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake
@@ -0,0 +1,38 @@
+#=============================================================================
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_cucollections)
+
+    if(TARGET cuco::cuco)
+        return()
+    endif()
+
+    # Find or install cuCollections
+    CPMFindPackage(NAME   cuco
+        GITHUB_REPOSITORY NVIDIA/cuCollections
+        GIT_TAG           0d602ae21ea4f38d23ed816aa948453d97b2ee4e
+        OPTIONS           "BUILD_TESTS OFF"
+                          "BUILD_BENCHMARKS OFF"
+                          "BUILD_EXAMPLES OFF"
+    )
+
+    set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include" PARENT_SCOPE)
+
+    # Make sure consumers of cudf can also see cuco::cuco target
+    fix_cmake_global_defaults(cuco::cuco)
+endfunction()
+
+find_and_configure_cucollections()
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
new file mode 100644
index 00000000000..64b3dd69c0d
--- /dev/null
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <io/parquet/parquet_gpu.hpp>
+
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/table/row_operators.cuh>
+
+#include <rmm/exec_policy.hpp>
+
+namespace cudf {
+namespace io {
+namespace parquet {
+namespace gpu {
+
+template <int block_size>  // block_size: stride of the slot loop (the launcher's blockDim.x)
+__global__ void __launch_bounds__(block_size, 1)
+  initialize_chunk_hash_maps_kernel(device_span<EncColumnChunk> chunks)
+{
+  auto chunk = chunks[blockIdx.x];  // one block per chunk; descriptor copied (auto, not auto&)
+  auto t     = threadIdx.x;
+  // Future: per-chunk dict is now the same size as ck.num_values; avoid one block per chunk
+  for (size_t i = 0; i < chunk.dict_map_size; i += block_size) {
+    if (t + i < chunk.dict_map_size) {  // guard the tail of the last stride
+      new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL};
+      new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL};
+    }
+  }
+}
+
+template <typename T>
+struct equality_functor {  // compares the elements of `col` found at two row indices
+  column_device_view const& col;  // column both indices refer to (held by reference)
+  __device__ bool operator()(size_type lhs_idx, size_type rhs_idx)
+  {
+    // Elements are read without a validity check; we don't call this for nulls so this is fine
+    return equality_compare(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
+  }
+};
+
+template <typename T>
+struct hash_functor {  // hashes the element of `col` at a row index with MurmurHash3_32<T>
+  column_device_view const& col;  // column the index refers to (held by reference)
+  __device__ auto operator()(size_type idx) { return MurmurHash3_32<T>{}(col.element<T>(idx)); }
+};
+
+struct map_insert_fn {
+  map_type::device_mutable_view& map;
+  // Inserts row index i as both key and value; returns whatever map.insert() reports.
+  template <typename T>
+  __device__ bool operator()(column_device_view const& col, size_type i)
+  {
+    if constexpr (column_device_view::has_element_accessor<T>()) {
+      auto hash_fn     = hash_functor<T>{col};      // hashes by the element at the index
+      auto equality_fn = equality_functor<T>{col};  // compares by the elements at the indices
+      return map.insert(std::make_pair(i, i), hash_fn, equality_fn);
+    } else {
+      cudf_assert(false && "Unsupported type to insert in map");
+    }
+    return false;  // reached only when T has no element accessor
+  }
+};
+
+struct map_find_fn {
+  map_type::device_view& map;
+  // Looks up row index i of `col` in the map; yields the result of map.find() or map.end().
+  template <typename T>
+  __device__ auto operator()(column_device_view const& col, size_type i)
+  {
+    if constexpr (column_device_view::has_element_accessor<T>()) {
+      auto hash_fn     = hash_functor<T>{col};      // hashes by the element at the index
+      auto equality_fn = equality_functor<T>{col};  // compares by the elements at the indices
+      return map.find(i, hash_fn, equality_fn);
+    } else {
+      cudf_assert(false && "Unsupported type to find in map");
+    }
+    return map.end();  // reached only when T has no element accessor
+  }
+};
+
+template <int block_size>
+__global__ void __launch_bounds__(block_size, 1)
+  populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan<EncColumnChunk> chunks,
+                                  size_type num_rows)
+{
+  auto col_idx = blockIdx.y;  // grid y: column index
+  auto block_x = blockIdx.x;  // grid x: index of the fragment-sized row range
+  auto t       = threadIdx.x;
+  // Rows [start_row, end_row) are the part of the table this block works on.
+  auto start_row =
+    block_x *
+    max_page_fragment_size;  // Fragment size; every chunk spans a multiple of this many rows.
+  size_type end_row = min(start_row + max_page_fragment_size, num_rows);
+
+  __shared__ EncColumnChunk* s_chunk;           // chunk whose row range contains start_row
+  __shared__ parquet_column_device_view s_col;  // copy of that chunk's column descriptor
+  __shared__ size_type s_start_value_idx;       // first leaf value index for this block
+  __shared__ size_type s_num_values;            // number of leaf values for this block
+  if (t == 0) {
+    // Find the chunk this block is a part of (linear scan over the row groups of this column)
+    size_type num_rowgroups = chunks.size().first;
+    size_type rg_idx        = 0;
+    while (rg_idx < num_rowgroups) {
+      if (auto ck = chunks[rg_idx][col_idx];
+          start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) {
+        break;
+      }
+      ++rg_idx;
+    }
+    s_chunk = &chunks[rg_idx][col_idx];  // NOTE(review): assumes the scan above found a match
+    s_col   = *(s_chunk->col_desc);
+  }
+  __syncthreads();  // s_chunk and s_col were written by thread 0 only
+  if (not s_chunk->use_dictionary) { return; }  // every thread reads the same s_chunk
+
+  if (t == 0) {
+    // Find the bounds of the leaf-column values this block will insert into the chunk's map
+    auto col             = *(s_col.parent_column);
+    auto start_value_idx = start_row;
+    auto end_value_idx   = end_row;
+    while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
+      if (col.type().id() == type_id::STRUCT) {  // struct: add the offset, descend into child 0
+        start_value_idx += col.offset();
+        end_value_idx += col.offset();
+        col = col.child(0);
+      } else {  // list: translate both bounds through the offsets child
+        auto offset_col = col.child(lists_column_view::offsets_column_index);
+        start_value_idx = offset_col.element<size_type>(start_value_idx + col.offset());
+        end_value_idx   = offset_col.element<size_type>(end_value_idx + col.offset());
+        col             = col.child(lists_column_view::child_column_index);
+      }
+    }
+    s_start_value_idx = start_value_idx;
+    s_num_values      = end_value_idx - start_value_idx;
+  }
+  __syncthreads();  // s_start_value_idx and s_num_values were written by thread 0 only
+
+  column_device_view const& data_col = *s_col.leaf_column;
+  using block_reduce                 = cub::BlockReduce<size_type, block_size>;
+  __shared__ typename block_reduce::TempStorage reduce_storage;
+
+  // Make a mutable view (for insert) and a read-only view (for find) over the same slots
+  auto hash_map_mutable = map_type::device_mutable_view(
+    s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL);
+  auto hash_map = map_type::device_view(
+    s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL);
+
+  __shared__ int total_num_dict_entries;  // chunk's entry count as seen after each pass
+  for (size_type i = 0; i < s_num_values; i += block_size) {
+    // One leaf value per thread per pass; threads past the end are treated as not valid
+    size_type val_idx = i + t + s_start_value_idx;
+    bool is_valid =
+      (i + t < s_num_values && val_idx < data_col.size()) and data_col.is_valid(val_idx);
+
+    // insert element at val_idx to hash map and count successful insertions
+    size_type is_unique      = 0;  // set from the result of the insert below
+    size_type uniq_elem_size = 0;  // byte size accounted for a newly inserted entry
+    if (is_valid) {
+      auto found_slot = type_dispatcher(data_col.type(), map_find_fn{hash_map}, data_col, val_idx);
+      if (found_slot == hash_map.end()) {  // not found: try to insert
+        is_unique =
+          type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx);
+        uniq_elem_size = [&]() -> size_type {
+          if (not is_unique) { return 0; }  // insert reported failure: nothing to account for
+          switch (s_col.physical_type) {
+            case Type::INT32: return 4;
+            case Type::INT64: return 8;
+            case Type::INT96: return 12;
+            case Type::FLOAT: return 4;
+            case Type::DOUBLE: return 8;
+            case Type::BYTE_ARRAY:
+              if (data_col.type().id() == type_id::STRING) {
+                // Strings are stored as 4 byte length + string bytes
+                return 4 + data_col.element<string_view>(val_idx).size_bytes();
+              }  // a non-string BYTE_ARRAY falls through to the unsupported-type assert below
+            case Type::FIXED_LEN_BYTE_ARRAY:
+            default: cudf_assert(false && "Unsupported type for dictionary encoding"); return 0;
+          }
+        }();
+      }
+    }
+    // Sum the per-thread counts and sizes over the block; only thread 0 consumes the sums.
+    __syncthreads();
+    auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
+    __syncthreads();  // reduce_storage is reused by the next reduction
+    auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
+    if (t == 0) {
+      total_num_dict_entries = atomicAdd(&s_chunk->num_dict_entries, num_unique);
+      total_num_dict_entries += num_unique;  // atomicAdd result plus this block's additions
+      atomicAdd(&s_chunk->uniq_data_size, uniq_data_size);
+    }
+    __syncthreads();  // total_num_dict_entries was written by thread 0 only
+
+    // Check if the num unique values in chunk has already exceeded max dict size and early exit
+    if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
+  }
+}
+
+template <int block_size>
+__global__ void __launch_bounds__(block_size, 1)
+  collect_map_entries_kernel(device_span<EncColumnChunk> chunks)
+{
+  auto& chunk = chunks[blockIdx.x];  // one block per chunk
+  if (not chunk.use_dictionary) { return; }
+
+  auto t = threadIdx.x;
+  auto map =
+    map_type::device_view(chunk.dict_map_slots, chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL);
+
+  __shared__ size_type counter;  // next free position in chunk.dict_data
+  if (t == 0) counter = 0;
+  __syncthreads();
+  for (size_t i = 0; i < chunk.dict_map_size; i += block_size) {
+    if (t + i < chunk.dict_map_size) {  // guard the tail of the last stride
+      auto slot = map.begin_slot() + t + i;
+      auto key  = static_cast<map_type::key_type>(slot->first);
+      if (key != KEY_SENTINEL) {  // skip slots that still hold the sentinel key
+        auto loc = atomicAdd(&counter, 1);  // claim a distinct position for this key
+        cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
+        chunk.dict_data[loc] = key;
+        // If sorting dict page ever becomes a hard requirement, enable the following statement and
+        // add a dict sorting step before storing into the slot's second field.
+        // chunk.dict_data_idx[loc] = t + i;
+        slot->second.store(loc);  // the slot's value now holds the position in dict_data
+        // TODO: ^ This doesn't need to be atomic. Try casting to value_type ptr and just writing.
+      }
+    }
+  }
+}
+
+template <int block_size>
+__global__ void __launch_bounds__(block_size, 1)
+  get_dictionary_indices_kernel(cudf::detail::device_2dspan<EncColumnChunk> chunks,
+                                size_type num_rows)
+{
+  auto col_idx = blockIdx.y;  // grid y: column index
+  auto block_x = blockIdx.x;  // grid x: index of the fragment-sized row range
+  auto t       = threadIdx.x;
+
+  size_type start_row = block_x * max_page_fragment_size;
+  size_type end_row   = min(start_row + max_page_fragment_size, num_rows);
+
+  __shared__ EncColumnChunk s_chunk;  // by-value copy of the chunk containing start_row
+  __shared__ parquet_column_device_view s_col;
+  __shared__ size_type s_start_value_idx;   // first leaf value index handled by this block
+  __shared__ size_type s_ck_start_val_idx;  // leaf value index derived from the chunk's start_row
+  __shared__ size_type s_num_values;        // number of leaf values handled by this block
+
+  if (t == 0) {
+    // Find the chunk this block is a part of
+    size_type num_rowgroups = chunks.size().first;
+    size_type rg_idx        = 0;
+    while (rg_idx < num_rowgroups) {
+      if (auto ck = chunks[rg_idx][col_idx];
+          start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) {
+        break;
+      }
+      ++rg_idx;
+    }
+    s_chunk = chunks[rg_idx][col_idx];  // NOTE(review): assumes the scan above found a match
+    s_col   = *(s_chunk.col_desc);
+
+    // Find the leaf-value range this block looks up and the leaf index where the chunk starts
+
+    auto col                 = *(s_col.parent_column);
+    auto start_value_idx     = start_row;
+    auto end_value_idx       = end_row;
+    auto chunk_start_val_idx = s_chunk.start_row;
+    while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
+      if (col.type().id() == type_id::STRUCT) {  // struct: add the offset, descend into child 0
+        start_value_idx += col.offset();
+        chunk_start_val_idx += col.offset();
+        end_value_idx += col.offset();
+        col = col.child(0);
+      } else {  // list: translate all three indices through the offsets child
+        auto offset_col     = col.child(lists_column_view::offsets_column_index);
+        start_value_idx     = offset_col.element<size_type>(start_value_idx + col.offset());
+        chunk_start_val_idx = offset_col.element<size_type>(chunk_start_val_idx + col.offset());
+        end_value_idx       = offset_col.element<size_type>(end_value_idx + col.offset());
+        col                 = col.child(lists_column_view::child_column_index);
+      }
+    }
+    s_start_value_idx  = start_value_idx;
+    s_ck_start_val_idx = chunk_start_val_idx;
+    s_num_values       = end_value_idx - start_value_idx;
+  }
+  __syncthreads();  // the shared variables above were written by thread 0 only
+
+  if (not s_chunk.use_dictionary) { return; }
+
+  column_device_view const& data_col = *s_col.leaf_column;
+
+  auto map = map_type::device_view(
+    s_chunk.dict_map_slots, s_chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL);
+
+  for (size_t i = 0; i < s_num_values; i += block_size) {
+    if (t + i < s_num_values) {
+      auto val_idx = s_start_value_idx + t + i;
+      bool is_valid =
+        (i + t < s_num_values && val_idx < data_col.size()) ? data_col.is_valid(val_idx) : false;
+      // (the i + t bound above repeats the condition of the enclosing if)
+      if (is_valid) {
+        auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx);
+        cudf_assert(found_slot != map.end() &&
+                    "Unable to find value in map in dictionary index construction");
+        if (found_slot != map.end()) {
+          // No need for atomic as this is not going to be modified by any other thread
+          auto* val_ptr = reinterpret_cast<map_type::mapped_type*>(&found_slot->second);
+          s_chunk.dict_index[val_idx - s_ck_start_val_idx] = *val_ptr;  // chunk-relative position
+        }
+      }
+    }
+  }
+}
+
+void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
+{
+  constexpr int block_size = 1024;  // threads per block; the grid has one block per chunk
+  initialize_chunk_hash_maps_kernel<block_size>
+    <<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
+}
+
+void populate_chunk_hash_maps(cudf::detail::device_2dspan<EncColumnChunk> chunks,
+                              size_type num_rows,
+                              rmm::cuda_stream_view stream)
+{
+  constexpr int block_size = 256;  // threads per block
+  auto const grid_x        = cudf::detail::grid_1d(num_rows, max_page_fragment_size);
+  auto const num_columns   = chunks.size().second;
+  dim3 const dim_grid(grid_x.num_blocks, num_columns);  // x: row fragments, y: columns
+  // Launched on `stream`; this function itself does not synchronize.
+  populate_chunk_hash_maps_kernel<block_size>
+    <<<dim_grid, block_size, 0, stream.value()>>>(chunks, num_rows);
+}
+
+void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
+{
+  constexpr int block_size = 1024;  // threads per block; the grid has one block per chunk
+  collect_map_entries_kernel<block_size><<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
+}
+
+void get_dictionary_indices(cudf::detail::device_2dspan<EncColumnChunk> chunks,
+                            size_type num_rows,
+                            rmm::cuda_stream_view stream)
+{
+  constexpr int block_size = 256;  // threads per block
+  auto const grid_x        = cudf::detail::grid_1d(num_rows, max_page_fragment_size);
+  auto const num_columns   = chunks.size().second;
+  dim3 const dim_grid(grid_x.num_blocks, num_columns);  // x: row fragments, y: columns
+  // Launched on `stream`; this function itself does not synchronize.
+  get_dictionary_indices_kernel<block_size>
+    <<<dim_grid, block_size, 0, stream.value()>>>(chunks, num_rows);
+}
+}  // namespace gpu
+}  // namespace parquet
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu
deleted file mode 100644
index 0c55828b120..00000000000
--- a/cpp/src/io/parquet/page_dict.cu
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <io/utilities/block_utils.cuh>
-#include "parquet_gpu.hpp"
-
-#include <cudf/utilities/error.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-
-#include <cub/cub.cuh>
-
-namespace cudf {
-namespace io {
-namespace parquet {
-namespace gpu {
-struct dict_state_s {
-  uint32_t row_cnt;
-  PageFragment* cur_fragment;
-  uint32_t* hashmap;
-  uint32_t total_dict_entries;  //!< Total number of entries in dictionary
-  uint32_t dictionary_size;     //!< Total dictionary size in bytes
-  uint32_t num_dict_entries;    //!< Dictionary entries in current fragment to add
-  uint32_t frag_dict_size;
-  EncColumnChunk ck;
-  parquet_column_device_view col;
-  PageFragment frag;
-  volatile uint32_t scratch_red[32];
-  uint16_t frag_dict[max_page_fragment_size];
-};
-
-/**
- * @brief Computes a 16-bit dictionary hash
- */
-inline __device__ uint32_t uint32_hash16(uint32_t v) { return (v + (v >> 16)) & 0xffff; }
-
-inline __device__ uint32_t uint64_hash16(uint64_t v)
-{
-  return uint32_hash16((uint32_t)(v + (v >> 32)));
-}
-
-inline __device__ uint32_t hash_string(const string_view& val)
-{
-  const char* p = val.data();
-  uint32_t len  = val.size_bytes();
-  uint32_t hash = len;
-  if (len > 0) {
-    uint32_t align_p    = 3 & reinterpret_cast<uintptr_t>(p);
-    const uint32_t* p32 = reinterpret_cast<const uint32_t*>(p - align_p);
-    uint32_t ofs        = align_p * 8;
-    uint32_t v;
-    while (len > 4) {
-      v = *p32++;
-      if (ofs) { v = __funnelshift_r(v, *p32, ofs); }
-      hash = __funnelshift_l(hash, hash, 5) + v;
-      len -= 4;
-    }
-    v = *p32;
-    if (ofs) { v = __funnelshift_r(v, (align_p + len > 4) ? p32[1] : 0, ofs); }
-    v &= ((2 << (len * 8 - 1)) - 1);
-    hash = __funnelshift_l(hash, hash, 5) + v;
-  }
-  return uint32_hash16(hash);
-}
-
-/**
- * @brief Fetch a page fragment and its dictionary entries in row-ascending order
- *
- * @param[in,out] s dictionary state
- * @param[in,out] dict_data fragment dictionary data for the current column (zeroed out after
- *fetching)
- * @param[in] frag_start_row row position of current fragment
- * @param[in] t thread id
- */
-__device__ void FetchDictionaryFragment(dict_state_s* s,
-                                        uint32_t* dict_data,
-                                        uint32_t frag_start_row,
-                                        uint32_t t)
-{
-  if (t == 0) s->frag = *s->cur_fragment;
-  __syncthreads();
-  // Store the row values in shared mem and set the corresponding dict_data to zero (end-of-list)
-  // It's easiest to do this here since we're only dealing with values all within a 5K-row window
-  for (uint32_t i = t; i < s->frag.num_dict_vals; i += 1024) {
-    uint32_t r      = dict_data[frag_start_row + i] - frag_start_row;
-    s->frag_dict[i] = r;
-  }
-  __syncthreads();
-  for (uint32_t i = t; i < s->frag.num_dict_vals; i += 1024) {
-    uint32_t r                    = s->frag_dict[i];
-    dict_data[frag_start_row + r] = 0;
-  }
-  __syncthreads();
-}
-
-/// Generate dictionary indices in ascending row order
-template <int block_size>
-__device__ void GenerateDictionaryIndices(dict_state_s* s, uint32_t t)
-{
-  using block_scan = cub::BlockScan<uint32_t, block_size>;
-  __shared__ typename block_scan::TempStorage temp_storage;
-  uint32_t* dict_index      = s->col.dict_index;
-  uint32_t* dict_data       = s->col.dict_data + s->ck.start_row;
-  uint32_t num_dict_entries = 0;
-
-  for (uint32_t i = 0; i < s->row_cnt; i += 1024) {
-    uint32_t row = s->ck.start_row + i + t;
-    uint32_t is_valid =
-      (i + t < s->row_cnt && row < s->col.num_rows) ? s->col.leaf_column->is_valid(row) : 0;
-    uint32_t dict_idx = (is_valid) ? dict_index[row] : 0;
-    uint32_t is_unique =
-      (is_valid &&
-       dict_idx ==
-         row);  // Any value that doesn't have bit31 set should have dict_idx=row at this point
-    uint32_t block_num_dict_entries;
-    uint32_t pos;
-    block_scan(temp_storage).ExclusiveSum(is_unique, pos, block_num_dict_entries);
-    pos += num_dict_entries;
-    num_dict_entries += block_num_dict_entries;
-    if (is_valid && is_unique) {
-      dict_data[pos]  = row;
-      dict_index[row] = pos;
-    }
-    __syncthreads();
-    if (is_valid && !is_unique) {
-      // NOTE: Should have at most 3 iterations (once for early duplicate elimination, once for
-      // final dictionary duplicate elimination and once for re-ordering) (If something went wrong
-      // building the dictionary, it will likely hang or crash right here)
-      do {
-        dict_idx = dict_index[dict_idx & 0x7fffffff];
-      } while (dict_idx > 0x7fffffff);
-      dict_index[row] = dict_idx;
-    }
-  }
-}
-
-// blockDim(1024, 1, 1)
-template <int block_size>
-__global__ void __launch_bounds__(block_size, 1)
-  gpuBuildChunkDictionaries(device_span<EncColumnChunk> chunks, uint32_t* dev_scratch)
-{
-  __shared__ __align__(8) dict_state_s state_g;
-  using block_reduce = cub::BlockReduce<uint32_t, block_size>;
-  __shared__ typename block_reduce::TempStorage temp_storage;
-
-  dict_state_s* const s = &state_g;
-  uint32_t t            = threadIdx.x;
-  uint32_t dtype, dtype_len, dtype_len_in;
-
-  if (t == 0) s->ck = chunks[blockIdx.x];
-  __syncthreads();
-
-  if (!s->ck.has_dictionary) { return; }
-
-  if (t == 0) s->col = *s->ck.col_desc;
-  __syncthreads();
-
-  if (!t) {
-    s->hashmap               = dev_scratch + s->ck.dictionary_id * (size_t)(1 << kDictHashBits);
-    s->row_cnt               = 0;
-    s->cur_fragment          = s->ck.fragments;
-    s->total_dict_entries    = 0;
-    s->dictionary_size       = 0;
-    s->ck.num_dict_fragments = 0;
-  }
-  dtype     = s->col.physical_type;
-  dtype_len = (dtype == INT96) ? 12 : (dtype == INT64 || dtype == DOUBLE) ? 8 : 4;
-  if (dtype == INT32) {
-    dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column);
-  } else if (dtype == INT96) {
-    dtype_len_in = 8;
-  } else {
-    dtype_len_in = dtype_len;
-  }
-  __syncthreads();
-  while (s->row_cnt < s->ck.num_rows) {
-    uint32_t frag_start_row = s->ck.start_row + s->row_cnt, num_dict_entries, frag_dict_size;
-    FetchDictionaryFragment(s, s->col.dict_data, frag_start_row, t);
-    __syncthreads();
-    num_dict_entries = s->frag.num_dict_vals;
-    if (!t) {
-      s->num_dict_entries = 0;
-      s->frag_dict_size   = 0;
-    }
-    for (uint32_t i = 0; i < num_dict_entries; i += 1024) {
-      bool is_valid    = (i + t < num_dict_entries);
-      uint32_t len     = 0;
-      uint32_t is_dupe = 0;
-      uint32_t row, hash, next, *next_addr;
-      uint32_t new_dict_entries;
-
-      if (is_valid) {
-        row = frag_start_row + s->frag_dict[i + t];
-        len = dtype_len;
-        if (dtype == BYTE_ARRAY) {
-          auto str1 = s->col.leaf_column->element<string_view>(row);
-          len += str1.size_bytes();
-          hash = hash_string(str1);
-          // Walk the list of rows with the same hash
-          next_addr = &s->hashmap[hash];
-          while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) {
-            auto const current = next - 1;
-            auto str2          = s->col.leaf_column->element<string_view>(current);
-            if (str1 == str2) {
-              is_dupe = 1;
-              break;
-            }
-            next_addr = &s->col.dict_data[next - 1];
-          }
-        } else {
-          uint64_t val;
-
-          if (dtype_len_in == 8) {
-            val  = s->col.leaf_column->element<uint64_t>(row);
-            hash = uint64_hash16(val);
-          } else {
-            val  = (dtype_len_in == 4)   ? s->col.leaf_column->element<uint32_t>(row)
-                   : (dtype_len_in == 2) ? s->col.leaf_column->element<uint16_t>(row)
-                                         : s->col.leaf_column->element<uint8_t>(row);
-            hash = uint32_hash16(val);
-          }
-          // Walk the list of rows with the same hash
-          next_addr = &s->hashmap[hash];
-          while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) {
-            auto const current = next - 1;
-            uint64_t val2 = (dtype_len_in == 8)   ? s->col.leaf_column->element<uint64_t>(current)
-                            : (dtype_len_in == 4) ? s->col.leaf_column->element<uint32_t>(current)
-                            : (dtype_len_in == 2) ? s->col.leaf_column->element<uint16_t>(current)
-                                                  : s->col.leaf_column->element<uint8_t>(current);
-            if (val2 == val) {
-              is_dupe = 1;
-              break;
-            }
-            next_addr = &s->col.dict_data[next - 1];
-          }
-        }
-      }
-      // Count the non-duplicate entries
-      frag_dict_size   = block_reduce(temp_storage).Sum((is_valid && !is_dupe) ? len : 0);
-      new_dict_entries = __syncthreads_count(is_valid && !is_dupe);
-      if (t == 0) {
-        s->frag_dict_size += frag_dict_size;
-        s->num_dict_entries += new_dict_entries;
-      }
-      if (is_valid) {
-        if (!is_dupe) {
-          s->col.dict_index[row] = row;
-        } else {
-          s->col.dict_index[row] = (next - 1) | (1u << 31);
-        }
-      }
-      __syncthreads();
-      // At this point, the dictionary order is non-deterministic, and we want insertion order
-      // Make sure that the non-duplicate entry corresponds to the lower row number
-      // (The entry in dict_data (next-1) used for duplicate elimination does not need
-      // to be the lowest row number)
-      bool reorder_check = (is_valid && is_dupe && next - 1 > row);
-      if (reorder_check) {
-        next = s->col.dict_index[next - 1];
-        while (next & (1u << 31)) {
-          next = s->col.dict_index[next & 0x7fffffff];
-        }
-      }
-      if (__syncthreads_or(reorder_check)) {
-        if (reorder_check) { atomicMin(&s->col.dict_index[next], row); }
-        __syncthreads();
-        if (reorder_check && s->col.dict_index[next] == row) {
-          s->col.dict_index[next] = row | (1u << 31);
-          s->col.dict_index[row]  = row;
-        }
-        __syncthreads();
-      }
-    }
-    __syncthreads();
-    num_dict_entries = s->num_dict_entries;
-    frag_dict_size   = s->frag_dict_size;
-    if (s->total_dict_entries + num_dict_entries > 65536 ||
-        (s->dictionary_size != 0 && s->dictionary_size + frag_dict_size > 512 * 1024)) {
-      break;
-    }
-    __syncthreads();
-    if (!t) {
-      if (num_dict_entries != s->frag.num_dict_vals) {
-        s->cur_fragment->num_dict_vals = num_dict_entries;
-      }
-      if (frag_dict_size != s->frag.dict_data_size) { s->frag.dict_data_size = frag_dict_size; }
-      s->total_dict_entries += num_dict_entries;
-      s->dictionary_size += frag_dict_size;
-      s->row_cnt += s->frag.num_rows;
-      s->cur_fragment++;
-      s->ck.num_dict_fragments++;
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-  GenerateDictionaryIndices<block_size>(s, t);
-  if (!t) {
-    chunks[blockIdx.x].num_dict_fragments = s->ck.num_dict_fragments;
-    chunks[blockIdx.x].dictionary_size    = s->dictionary_size;
-    chunks[blockIdx.x].total_dict_entries = s->total_dict_entries;
-  }
-}
-
-/**
- * @brief Launches kernel for building chunk dictionaries
- *
- * @param[in,out] chunks Column chunks
- * @param[in] dev_scratch Device scratch data (kDictScratchSize per dictionary)
- * @param[in] stream CUDA stream to use, default 0
- */
-void BuildChunkDictionaries(device_span<EncColumnChunk> chunks,
-                            uint32_t* dev_scratch,
-                            rmm::cuda_stream_view stream)
-{
-  auto num_chunks = chunks.size();
-  gpuBuildChunkDictionaries<1024><<<num_chunks, 1024, 0, stream.value()>>>(chunks, dev_scratch);
-}
-
-}  // namespace gpu
-}  // namespace parquet
-}  // namespace io
-}  // namespace cudf
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 20a7ab7ca6d..70b2e27f75d 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -48,14 +48,7 @@ constexpr uint32_t rle_buffer_size = (1 << 9);
 struct frag_init_state_s {
   parquet_column_device_view col;
   PageFragment frag;
-  uint32_t total_dupes;
   size_type start_value_idx;
-  volatile uint32_t scratch_red[32];
-  uint32_t dict[max_page_fragment_size];
-  union {
-    uint16_t u16[1 << (init_hash_bits)];
-    uint32_t u32[1 << (init_hash_bits - 1)];
-  } map;
 };
 
 struct page_enc_state_s {
@@ -68,6 +61,7 @@ struct page_enc_state_s {
   uint32_t rle_lit_count;
   uint32_t rle_rpt_count;
   uint32_t page_start_val;
+  uint32_t chunk_start_val;
   volatile uint32_t rpt_map[4];
   volatile uint32_t scratch_red[32];
   EncPage page;
@@ -124,31 +118,22 @@ __global__ void __launch_bounds__(block_size)
   __shared__ __align__(16) frag_init_state_s state_g;
 
   using block_reduce = cub::BlockReduce<uint32_t, block_size>;
-  using block_scan   = cub::BlockScan<uint32_t, block_size>;
-  __shared__ union {
-    typename block_reduce::TempStorage reduce_storage;
-    typename block_scan::TempStorage scan_storage;
-  } temp_storage;
+  __shared__ typename block_reduce::TempStorage reduce_storage;
 
   frag_init_state_s* const s = &state_g;
   uint32_t t                 = threadIdx.x;
-  uint32_t start_row, dtype_len, dtype_len_in, dtype;
+  uint32_t start_row, dtype_len, dtype;
 
   if (t == 0) s->col = col_desc[blockIdx.x];
-  for (uint32_t i = 0; i < sizeof(s->map) / sizeof(uint32_t); i += block_size) {
-    if (i + t < sizeof(s->map) / sizeof(uint32_t)) s->map.u32[i + t] = 0;
-  }
   __syncthreads();
   start_row = blockIdx.y * fragment_size;
   if (!t) {
     // frag.num_rows = fragment_size except for the last page fragment which can be smaller.
     // num_rows is fixed but fragment size could be larger if the data is strings or nested.
     s->frag.num_rows           = min(fragment_size, max_num_rows - min(start_row, max_num_rows));
-    s->frag.non_nulls          = 0;
     s->frag.num_dict_vals      = 0;
     s->frag.fragment_data_size = 0;
     s->frag.dict_data_size     = 0;
-    s->total_dupes             = 0;
 
     // To use num_vals instead of num_rows, we need to calculate num_vals on the fly.
     // For list<list<int>>, values between i and i+50 can be calculated by
@@ -195,16 +180,6 @@ __global__ void __launch_bounds__(block_size)
               : (dtype == INT64 || dtype == DOUBLE) ? 8
               : (dtype == BOOLEAN)                  ? 1
                                                     : 4;
-  if (dtype == INT32) {
-    dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column);
-  } else if (dtype == INT96) {
-    // cudf doesn't support INT96 internally and uses INT64, so treat INT96 as an INT64 for
-    // computing dictionary hash values and reading the data, but we do treat it as 12 bytes for
-    // dtype_len, which determines how much memory we need to allocate for the fragment.
-    dtype_len_in = 8;
-  } else {
-    dtype_len_in = dtype_len;
-  }
   __syncthreads();
 
   size_type nvals           = s->frag.num_leaf_values;
@@ -215,167 +190,22 @@ __global__ void __launch_bounds__(block_size)
     uint32_t is_valid = (i + t < nvals && val_idx < s->col.leaf_column->size())
                           ? s->col.leaf_column->is_valid(val_idx)
                           : 0;
-    uint32_t len, nz_pos, hash;
+    uint32_t len;
     if (is_valid) {
       len = dtype_len;
       if (dtype != BOOLEAN) {
         if (dtype == BYTE_ARRAY) {
           auto str = s->col.leaf_column->element<string_view>(val_idx);
           len += str.size_bytes();
-          hash = hash_string(str);
-        } else if (dtype_len_in == 8) {
-          hash = uint64_init_hash(s->col.leaf_column->element<uint64_t>(val_idx));
-        } else {
-          hash =
-            uint32_init_hash((dtype_len_in == 4)   ? s->col.leaf_column->element<uint32_t>(val_idx)
-                             : (dtype_len_in == 2) ? s->col.leaf_column->element<uint16_t>(val_idx)
-                                                   : s->col.leaf_column->element<uint8_t>(val_idx));
         }
       }
     } else {
       len = 0;
     }
 
-    uint32_t non_nulls;
-    block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, nz_pos, non_nulls);
-    nz_pos += s->frag.non_nulls;
-    __syncthreads();
-    len = block_reduce(temp_storage.reduce_storage).Sum(len);
-    if (!t) {
-      s->frag.non_nulls += non_nulls;
-      s->frag.fragment_data_size += len;
-    }
-    __syncthreads();
-    if (is_valid && dtype != BOOLEAN) {
-      uint32_t* dict_index = s->col.dict_index;
-      if (dict_index) {
-        atomicAdd(&s->map.u32[hash >> 1], (hash & 1) ? 1 << 16 : 1);
-        dict_index[start_value_idx + nz_pos] =
-          ((i + t) << init_hash_bits) |
-          hash;  // Store the hash along with the index, so we don't have to recompute it
-      }
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-  // Reorder the 16-bit local indices according to the hash values
-  if (s->col.dict_index) {
-    static_assert((init_hash_bits == 12), "Hardcoded for init_hash_bits=12");
-    // Cumulative sum of hash map counts
-    uint32_t count01 = s->map.u32[t * 4 + 0];
-    uint32_t count23 = s->map.u32[t * 4 + 1];
-    uint32_t count45 = s->map.u32[t * 4 + 2];
-    uint32_t count67 = s->map.u32[t * 4 + 3];
-    uint32_t sum01   = count01 + (count01 << 16);
-    uint32_t sum23   = count23 + (count23 << 16);
-    uint32_t sum45   = count45 + (count45 << 16);
-    uint32_t sum67   = count67 + (count67 << 16);
-    sum23 += (sum01 >> 16) * 0x10001;
-    sum45 += (sum23 >> 16) * 0x10001;
-    sum67 += (sum45 >> 16) * 0x10001;
-    uint32_t sum_w = sum67 >> 16;
-    block_scan(temp_storage.scan_storage).InclusiveSum(sum_w, sum_w);
-    sum_w                 = (sum_w - (sum67 >> 16)) * 0x10001;
-    s->map.u32[t * 4 + 0] = sum_w + sum01 - count01;
-    s->map.u32[t * 4 + 1] = sum_w + sum23 - count23;
-    s->map.u32[t * 4 + 2] = sum_w + sum45 - count45;
-    s->map.u32[t * 4 + 3] = sum_w + sum67 - count67;
-  }
-  __syncthreads();
-  // Put the indices back in hash order
-  if (s->col.dict_index) {
-    uint32_t* dict_index = s->col.dict_index + start_row;
-    uint32_t nnz         = s->frag.non_nulls;
-    for (uint32_t i = 0; i < nnz; i += block_size) {
-      uint32_t pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row, val = 0;
-      bool collision;
-      if (i + t < nnz) {
-        val     = dict_index[i + t];
-        hash    = val & ((1 << init_hash_bits) - 1);
-        sh      = (hash & 1) ? 16 : 0;
-        pos_old = s->map.u16[hash];
-      }
-      // The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic
-      // behavior for the first row in the hash map that will be used for early duplicate detection
-      __syncthreads();
-      if (i + t < nnz) {
-        pos          = (atomicAdd(&s->map.u32[hash >> 1], 1 << sh) >> sh) & 0xffff;
-        s->dict[pos] = val;
-      }
-      __syncthreads();
-      collision = false;
-      if (i + t < nnz) {
-        pos_new   = s->map.u16[hash];
-        collision = (pos != pos_old && pos_new > pos_old + 1);
-        if (collision) { colliding_row = s->dict[pos_old]; }
-      }
-      __syncthreads();
-      if (collision) { atomicMin(&s->dict[pos_old], val); }
-      __syncthreads();
-      // Resolve collision
-      if (collision && val == s->dict[pos_old]) { s->dict[pos] = colliding_row; }
-    }
+    len = block_reduce(reduce_storage).Sum(len);
+    if (!t) { s->frag.fragment_data_size += len; }
     __syncthreads();
-    // Now that the values are ordered by hash, compare every entry with the first entry in the hash
-    // map, the position of the first entry can be inferred from the hash map counts
-    uint32_t dupe_data_size = 0;
-    for (uint32_t i = 0; i < nnz; i += block_size) {
-      uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0;
-      if (i + t < nnz) {
-        uint32_t dict_val = s->dict[i + t];
-        uint32_t hash     = dict_val & ((1 << init_hash_bits) - 1);
-        ck_row            = start_row + (dict_val >> init_hash_bits);
-        ck_row_ref = start_row + (s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0] >> init_hash_bits);
-        if (ck_row_ref != ck_row) {
-          if (dtype == BYTE_ARRAY) {
-            auto str1 = s->col.leaf_column->element<string_view>(ck_row);
-            auto str2 = s->col.leaf_column->element<string_view>(ck_row_ref);
-            is_dupe   = (str1 == str2);
-            dupe_data_size += (is_dupe) ? 4 + str1.size_bytes() : 0;
-          } else {
-            if (dtype_len_in == 8) {
-              auto v1 = s->col.leaf_column->element<uint64_t>(ck_row);
-              auto v2 = s->col.leaf_column->element<uint64_t>(ck_row_ref);
-              is_dupe = (v1 == v2);
-              dupe_data_size += (is_dupe) ? 8 : 0;
-            } else {
-              uint32_t v1, v2;
-              if (dtype_len_in == 4) {
-                v1 = s->col.leaf_column->element<uint32_t>(ck_row);
-                v2 = s->col.leaf_column->element<uint32_t>(ck_row_ref);
-              } else if (dtype_len_in == 2) {
-                v1 = s->col.leaf_column->element<uint16_t>(ck_row);
-                v2 = s->col.leaf_column->element<uint16_t>(ck_row_ref);
-              } else {
-                v1 = s->col.leaf_column->element<uint8_t>(ck_row);
-                v2 = s->col.leaf_column->element<uint8_t>(ck_row_ref);
-              }
-              is_dupe = (v1 == v2);
-              dupe_data_size += (is_dupe) ? 4 : 0;
-            }
-          }
-        }
-      }
-      uint32_t dupes_in_block;
-      uint32_t dupes_before;
-      block_scan(temp_storage.scan_storage).InclusiveSum(is_dupe, dupes_before, dupes_in_block);
-      dupes_before += s->total_dupes;
-      __syncthreads();
-      if (t == 0) { s->total_dupes += dupes_in_block; }
-      if (i + t < nnz) {
-        if (!is_dupe) {
-          s->col.dict_data[start_row + i + t - dupes_before] = ck_row;
-        } else {
-          s->col.dict_index[ck_row] = ck_row_ref | (1u << 31);
-        }
-      }
-    }
-    __syncthreads();
-    dupe_data_size = block_reduce(temp_storage.reduce_storage).Sum(dupe_data_size);
-    if (!t) {
-      s->frag.dict_data_size = s->frag.fragment_data_size - dupe_data_size;
-      s->frag.num_dict_vals  = s->frag.non_nulls - s->total_dupes;
-    }
   }
   __syncthreads();
   if (t == 0) frag[blockIdx.x][blockIdx.y] = s->frag;
@@ -449,22 +279,21 @@ __global__ void __launch_bounds__(128)
       pagestats_g.start_chunk = ck_g.first_fragment;
       pagestats_g.num_chunks  = 0;
     }
-    if (ck_g.has_dictionary) {
+    if (ck_g.use_dictionary) {
       if (!t) {
         page_g.page_data       = ck_g.uncompressed_bfr + page_offset;
         page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset;
         page_g.num_fragments   = 0;
         page_g.page_type       = PageType::DICTIONARY_PAGE;
-        page_g.dict_bits_plus1 = 0;
         page_g.chunk           = &chunks[blockIdx.y][blockIdx.x];
         page_g.chunk_id        = blockIdx.y * num_columns + blockIdx.x;
         page_g.hdr_size        = 0;
         page_g.max_hdr_size    = 32;
-        page_g.max_data_size   = ck_g.dictionary_size;
+        page_g.max_data_size   = ck_g.uniq_data_size;
         page_g.start_row       = cur_row;
-        page_g.num_rows        = ck_g.total_dict_entries;
-        page_g.num_leaf_values = ck_g.total_dict_entries;
-        page_g.num_values      = ck_g.total_dict_entries;
+        page_g.num_rows        = ck_g.num_dict_entries;
+        page_g.num_leaf_values = ck_g.num_dict_entries;
+        page_g.num_values      = ck_g.num_dict_entries;  // TODO: shouldn't matter for dict page
         page_offset += page_g.max_hdr_size + page_g.max_data_size;
         comp_page_offset += page_g.max_hdr_size + GetMaxCompressedBfrSize(page_g.max_data_size);
       }
@@ -483,7 +312,7 @@ __global__ void __launch_bounds__(128)
     // This doesn't actually deal with data. It's agnostic. It only cares about number of rows and
     // page size.
     do {
-      uint32_t fragment_data_size, max_page_size, minmax_len = 0;
+      uint32_t minmax_len = 0;
       __syncwarp();
       if (num_rows < ck_g.num_rows) {
         if (t == 0) { frag_g = ck_g.fragments[fragments_in_chunk]; }
@@ -496,50 +325,27 @@ __global__ void __launch_bounds__(128)
         frag_g.num_rows           = 0;
       }
       __syncwarp();
-      if (ck_g.has_dictionary && fragments_in_chunk < ck_g.num_dict_fragments) {
-        fragment_data_size =
-          frag_g.num_leaf_values * 2;  // Assume worst-case of 2-bytes per dictionary index
-      } else {
-        fragment_data_size = frag_g.fragment_data_size;
-      }
+      uint32_t fragment_data_size =
+        (ck_g.use_dictionary)
+          ? frag_g.num_leaf_values * 2  // Assume worst-case of 2-bytes per dictionary index
+          : frag_g.fragment_data_size;
       // TODO (dm): this convoluted logic to limit page size needs refactoring
-      max_page_size = (values_in_page * 2 >= ck_g.num_values)   ? 256 * 1024
-                      : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024
-                                                                : 512 * 1024;
+      uint32_t max_page_size = (values_in_page * 2 >= ck_g.num_values)   ? 256 * 1024
+                               : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024
+                                                                         : 512 * 1024;
       if (num_rows >= ck_g.num_rows ||
-          (values_in_page > 0 &&
-           (page_size + fragment_data_size > max_page_size ||
-            (ck_g.has_dictionary && fragments_in_chunk == ck_g.num_dict_fragments)))) {
-        uint32_t dict_bits_plus1;
-
-        if (ck_g.has_dictionary && page_start < ck_g.num_dict_fragments) {
-          uint32_t dict_bits;
-          if (num_dict_entries <= 2) {
-            dict_bits = 1;
-          } else if (num_dict_entries <= 4) {
-            dict_bits = 2;
-          } else if (num_dict_entries <= 16) {
-            dict_bits = 4;
-          } else if (num_dict_entries <= 256) {
-            dict_bits = 8;
-          } else if (num_dict_entries <= 4096) {
-            dict_bits = 12;
-          } else {
-            dict_bits = 16;
-          }
-          page_size       = 1 + 5 + ((values_in_page * dict_bits + 7) >> 3) + (values_in_page >> 8);
-          dict_bits_plus1 = dict_bits + 1;
-        } else {
-          dict_bits_plus1 = 0;
+          (values_in_page > 0 && (page_size + fragment_data_size > max_page_size))) {
+        if (ck_g.use_dictionary) {
+          page_size =
+            1 + 5 + ((values_in_page * ck_g.dict_rle_bits + 7) >> 3) + (values_in_page >> 8);
         }
         if (!t) {
-          page_g.num_fragments   = fragments_in_chunk - page_start;
-          page_g.chunk           = &chunks[blockIdx.y][blockIdx.x];
-          page_g.chunk_id        = blockIdx.y * num_columns + blockIdx.x;
-          page_g.page_type       = PageType::DATA_PAGE;
-          page_g.dict_bits_plus1 = dict_bits_plus1;
-          page_g.hdr_size        = 0;
-          page_g.max_hdr_size    = 32;  // Max size excluding statistics
+          page_g.num_fragments = fragments_in_chunk - page_start;
+          page_g.chunk         = &chunks[blockIdx.y][blockIdx.x];
+          page_g.chunk_id      = blockIdx.y * num_columns + blockIdx.x;
+          page_g.page_type     = PageType::DATA_PAGE;
+          page_g.hdr_size      = 0;
+          page_g.max_hdr_size  = 32;  // Max size excluding statistics
           if (ck_g.stats) {
             uint32_t stats_hdr_len = 16;
             if (col_g.stats_dtype == dtype_string) {
@@ -611,8 +417,8 @@ __global__ void __launch_bounds__(128)
       ck_g.num_pages          = num_pages;
       ck_g.bfr_size           = page_offset;
       ck_g.compressed_size    = comp_page_offset;
-      pagestats_g.start_chunk = ck_g.first_page + ck_g.has_dictionary;  // Exclude dictionary
-      pagestats_g.num_chunks  = num_pages - ck_g.has_dictionary;
+      pagestats_g.start_chunk = ck_g.first_page + ck_g.use_dictionary;  // Exclude dictionary
+      pagestats_g.num_chunks  = num_pages - ck_g.use_dictionary;
     }
   }
   __syncthreads();
@@ -1069,7 +875,10 @@ __global__ void __launch_bounds__(128, 8)
   } else {
     dtype_len_in = dtype_len_out;
   }
-  dict_bits = (dtype == BOOLEAN) ? 1 : (s->page.dict_bits_plus1 - 1);
+  dict_bits = (dtype == BOOLEAN) ? 1
+              : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE)
+                ? s->ck.dict_rle_bits
+                : -1;
   if (t == 0) {
     uint8_t* dst   = s->cur;
     s->rle_run     = 0;
@@ -1080,37 +889,56 @@ __global__ void __launch_bounds__(128, 8)
       dst[0]     = dict_bits;
       s->rle_out = dst + 1;
     }
-    s->page_start_val = s->page.start_row;
-    if (s->col.parent_column != nullptr) {
+    s->page_start_val    = s->page.start_row;  // Dictionary page's start row is chunk's start row
+    auto chunk_start_val = s->ck.start_row;
+    if (s->col.parent_column != nullptr) {  // TODO: remove this check. parent is now never nullptr
       auto col                    = *(s->col.parent_column);
       auto current_page_start_val = s->page_start_val;
+      // TODO: We do this so much. Add a global function that converts row idx to val idx
       while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
         if (col.type().id() == type_id::STRUCT) {
           current_page_start_val += col.offset();
+          chunk_start_val += col.offset();
           col = col.child(0);
         } else {
-          current_page_start_val = col.child(lists_column_view::offsets_column_index)
-                                     .element<size_type>(current_page_start_val + col.offset());
-          col = col.child(lists_column_view::child_column_index);
+          auto offset_col = col.child(lists_column_view::offsets_column_index);
+          current_page_start_val =
+            offset_col.element<size_type>(current_page_start_val + col.offset());
+          chunk_start_val = offset_col.element<size_type>(chunk_start_val + col.offset());
+          col             = col.child(lists_column_view::child_column_index);
         }
       }
-      s->page_start_val = current_page_start_val;
+      s->page_start_val  = current_page_start_val;
+      s->chunk_start_val = chunk_start_val;
     }
   }
   __syncthreads();
   for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) {
-    uint32_t nvals   = min(s->page.num_leaf_values - cur_val_idx, 128);
-    uint32_t val_idx = s->page_start_val + cur_val_idx + t;
-    uint32_t is_valid, len, pos;
+    uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128);
+    uint32_t len, pos;
+
+    auto [is_valid, val_idx] = [&]() {
+      uint32_t val_idx;
+      uint32_t is_valid;
+
+      size_type val_idx_in_block = cur_val_idx + t;
+      if (s->page.page_type == PageType::DICTIONARY_PAGE) {
+        val_idx  = val_idx_in_block;
+        is_valid = (val_idx < s->page.num_leaf_values);
+        if (is_valid) { val_idx = s->ck.dict_data[val_idx]; }
+      } else {
+        size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block;
+
+        is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() &&
+                    val_idx_in_block < s->page.num_leaf_values)
+                     ? s->col.leaf_column->is_valid(val_idx_in_leaf_col)
+                     : 0;
+        val_idx =
+          (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col;
+      }
+      return std::make_tuple(is_valid, val_idx);
+    }();
 
-    if (s->page.page_type == PageType::DICTIONARY_PAGE) {
-      is_valid = (cur_val_idx + t < s->page.num_leaf_values);
-      val_idx  = (is_valid) ? s->col.dict_data[val_idx] : val_idx;
-    } else {
-      is_valid = (val_idx < s->col.leaf_column->size() && cur_val_idx + t < s->page.num_leaf_values)
-                   ? s->col.leaf_column->is_valid(val_idx)
-                   : 0;
-    }
     cur_val_idx += nvals;
     if (dict_bits >= 0) {
       // Dictionary encoding
@@ -1124,7 +952,7 @@ __global__ void __launch_bounds__(128, 8)
           if (dtype == BOOLEAN) {
             v = s->col.leaf_column->element<uint8_t>(val_idx);
           } else {
-            v = s->col.dict_index[val_idx];
+            v = s->ck.dict_index[val_idx];
           }
           s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v;
         }
@@ -1531,13 +1359,12 @@ __global__ void __launch_bounds__(128)
     // data pages (actual encoding is identical).
     Encoding encoding;
     if (enable_bool_rle) {
-      encoding = (col_g.physical_type != BOOLEAN)
-                   ? (page_type == PageType::DICTIONARY_PAGE || page_g.dict_bits_plus1 != 0)
-                       ? Encoding::PLAIN_DICTIONARY
-                       : Encoding::PLAIN
-                   : Encoding::RLE;
+      encoding = (col_g.physical_type == BOOLEAN) ? Encoding::RLE
+                 : (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary)
+                   ? Encoding::PLAIN_DICTIONARY
+                   : Encoding::PLAIN;
     } else {
-      encoding = (page_type == PageType::DICTIONARY_PAGE || page_g.dict_bits_plus1 != 0)
+      encoding = (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary)
                    ? Encoding::PLAIN_DICTIONARY
                    : Encoding::PLAIN;
     }
@@ -1562,7 +1389,7 @@ __global__ void __launch_bounds__(128)
     } else {
       // DictionaryPageHeader
       encoder.field_struct_begin(7);
-      encoder.field_int32(1, ck_g.total_dict_entries);  // number of values in dictionary
+      encoder.field_int32(1, ck_g.num_dict_entries);  // number of values in dictionary
       encoder.field_int32(2, encoding);
       encoder.field_struct_end(7);
     }
@@ -1613,12 +1440,12 @@ __global__ void __launch_bounds__(1024)
     memcpy_block<1024, true>(dst, src, data_len, t);
     dst += data_len;
     __syncthreads();
-    if (!t && page == 0 && ck_g.has_dictionary) { ck_g.dictionary_size = hdr_len + data_len; }
+    if (!t && page == 0 && ck_g.use_dictionary) { ck_g.dictionary_size = hdr_len + data_len; }
   }
   if (t == 0) {
     chunks[blockIdx.x].bfr_size        = uncompressed_size;
     chunks[blockIdx.x].compressed_size = (dst - dst_base);
-    if (ck_g.has_dictionary) { chunks[blockIdx.x].dictionary_size = ck_g.dictionary_size; }
+    if (ck_g.use_dictionary) { chunks[blockIdx.x].dictionary_size = ck_g.dictionary_size; }
   }
 }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 975d2545cd1..cdd7c6b6674 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -28,6 +28,8 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <cuco/static_map.cuh>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
@@ -42,6 +44,10 @@ namespace parquet {
 
 using cudf::io::detail::string_index_pair;
 
+// Total number of unsigned 16 bit values
+constexpr size_type MAX_DICT_SIZE =
+  std::numeric_limits<uint16_t>::max() - std::numeric_limits<uint16_t>::min() + 1;
+
 /**
  * @brief Struct representing an input column in the file.
  */
@@ -56,6 +62,11 @@ struct input_column_info {
 
 namespace gpu {
 
+auto constexpr KEY_SENTINEL   = size_type{-1};
+auto constexpr VALUE_SENTINEL = size_type{-1};
+using map_type                = cuco::static_map<size_type, size_type>;
+using slot_type               = map_type::pair_atomic_type;
+
 /**
  * @brief Enums for the flags in the page header
  */
@@ -222,8 +233,6 @@ struct ColumnChunkDesc {
  * @brief Struct describing an encoder column
  */
 struct parquet_column_device_view : stats_column_desc {
-  uint32_t* dict_index;    //!< Dictionary index [row]
-  uint32_t* dict_data;     //!< Dictionary data (unique row indices)
   uint8_t physical_type;   //!< physical data type
   uint8_t converted_type;  //!< logical data type
   uint8_t level_bits;  //!< bits to encode max definition (lower nibble) & repetition (upper nibble)
@@ -236,9 +245,9 @@ struct parquet_column_device_view : stats_column_desc {
   size_type const* level_offsets;  //!< Offset array for per-row pre-calculated rep/def level values
   uint8_t const* rep_values;       //!< Pre-calculated repetition level values
   uint8_t const* def_values;       //!< Pre-calculated definition level values
-  uint8_t* nullability;  //!< Array of nullability of each nesting level. e.g. nullable[0] is
-                         //!< nullability of parent_column. May be different from col.nullable() in
-                         //!< case of chunked writing.
+  uint8_t const* nullability;  //!< Array of nullability of each nesting level. e.g. nullable[0] is
+                               //!< nullability of parent_column. May be different from
+                               //!< col.nullable() in case of chunked writing.
 };
 
 constexpr int max_page_fragment_size = 5000;  //!< Max number of rows in a page fragment
@@ -253,7 +262,6 @@ struct PageFragment {
   uint32_t start_value_idx;
   uint32_t num_leaf_values;  //!< Number of leaf values in fragment. Does not include nulls at
                              //!< non-leaf level
-  uint32_t non_nulls;        //!< Number of non-null values
   uint16_t num_rows;         //!< Number of rows in fragment
   uint16_t num_dict_vals;    //!< Number of unique dictionary entries
 };
@@ -292,26 +300,33 @@ struct EncPage;
  */
 struct EncColumnChunk {
   parquet_column_device_view const* col_desc;  //!< Column description
-  PageFragment* fragments;                     //!< First fragment in chunk
-  uint8_t* uncompressed_bfr;                   //!< Uncompressed page data
-  uint8_t* compressed_bfr;                     //!< Compressed page data
-  statistics_chunk const* stats;               //!< Fragment statistics
-  uint32_t bfr_size;                           //!< Uncompressed buffer size
-  uint32_t compressed_size;                    //!< Compressed buffer size
-  uint32_t start_row;                          //!< First row of chunk
-  uint32_t num_rows;                           //!< Number of rows in chunk
-  uint32_t num_values;      //!< Number of values in chunk. Different from num_rows for nested types
+  size_type col_desc_id;
+  PageFragment* fragments;        //!< First fragment in chunk
+  uint8_t* uncompressed_bfr;      //!< Uncompressed page data
+  uint8_t* compressed_bfr;        //!< Compressed page data
+  statistics_chunk const* stats;  //!< Fragment statistics
+  uint32_t bfr_size;              //!< Uncompressed buffer size
+  uint32_t compressed_size;       //!< Compressed buffer size
+  uint32_t start_row;             //!< First row of chunk
+  uint32_t num_rows;              //!< Number of rows in chunk
+  size_type num_values;     //!< Number of values in chunk. Different from num_rows for nested types
   uint32_t first_fragment;  //!< First fragment of chunk
   EncPage* pages;           //!< Ptr to pages that belong to this chunk
   uint32_t first_page;      //!< First page of chunk
   uint32_t num_pages;       //!< Number of pages in chunk
-  uint32_t dictionary_id;   //!< Dictionary id for this chunk
   uint8_t is_compressed;    //!< Nonzero if the chunk uses compression
-  uint8_t has_dictionary;   //!< Nonzero if the chunk uses dictionary encoding
-  uint16_t num_dict_fragments;  //!< Number of fragments using dictionary
-  uint32_t dictionary_size;     //!< Size of dictionary
-  uint32_t total_dict_entries;  //!< Total number of entries in dictionary
-  uint32_t ck_stat_size;        //!< Size of chunk-level statistics (included in 1st page header)
+  uint32_t dictionary_size;    //!< Size of dictionary page including header
+  uint32_t ck_stat_size;       //!< Size of chunk-level statistics (included in 1st page header)
+  slot_type* dict_map_slots;   //!< Hash map storage for calculating dict encoding for this chunk
+  size_type dict_map_size;     //!< Size of dict_map_slots
+  size_type num_dict_entries;  //!< Total number of entries in dictionary
+  size_type
+    uniq_data_size;  //!< Size of dictionary page (set of all unique values) if dict enc is used
+  size_type plain_data_size;  //!< Size of data in this chunk if plain encoding is used
+  size_type* dict_data;       //!< Dictionary data (unique row indices)
+  uint16_t* dict_index;   //!< Index of value in dictionary page. column[dict_data[dict_index[row]]]
+  uint8_t dict_rle_bits;  //!< Bit size for encoding dictionary indices
+  bool use_dictionary;    //!< True if the chunk uses dictionary encoding
 };
 
 /**
@@ -322,7 +337,6 @@ struct EncPage {
   uint8_t* compressed_data;  //!< Ptr to compressed page
   uint16_t num_fragments;    //!< Number of fragments in page
   PageType page_type;        //!< Page type
-  uint8_t dict_bits_plus1;   //!< 0=plain, nonzero:bits to encoding dictionary indices + 1
   EncColumnChunk* chunk;     //!< Chunk that this page belongs to
   uint32_t chunk_id;         //!< Index in chunk array
   uint32_t hdr_size;         //!< Size of page header
@@ -449,7 +463,7 @@ dremel_data get_dremel_data(column_view h_col,
  * @param[in] num_columns Number of columns
  * @param[in] fragment_size Number of rows per fragment
  * @param[in] num_rows Number of rows per column
- * @param[in] stream CUDA stream to use, default 0
+ * @param[in] stream CUDA stream to use
  */
 void InitPageFragments(cudf::detail::device_2dspan<PageFragment> frag,
                        device_span<parquet_column_device_view const> col_desc,
@@ -463,13 +477,57 @@ void InitPageFragments(cudf::detail::device_2dspan<PageFragment> frag,
  * @param[out] groups Statistics groups [num_columns x num_fragments]
  * @param[in] fragments Page fragments [num_columns x num_fragments]
  * @param[in] col_desc Column description [num_columns]
- * @param[in] stream CUDA stream to use, default 0
+ * @param[in] stream CUDA stream to use
  */
 void InitFragmentStatistics(cudf::detail::device_2dspan<statistics_group> groups,
                             cudf::detail::device_2dspan<PageFragment const> fragments,
                             device_span<gpu::parquet_column_device_view const> col_desc,
                             rmm::cuda_stream_view stream);
 
+/**
+ * @brief Initialize per-chunk hash maps used for dictionary with sentinel values
+ *
+ * @param chunks Flat span of chunks to initialize hash maps for
+ * @param stream CUDA stream to use
+ */
+void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream);
+
+/**
+ * @brief Insert chunk values into their respective hash maps
+ *
+ * @param chunks Column chunks [rowgroup][column]
+ * @param num_rows Number of rows per column
+ * @param stream CUDA stream to use
+ */
+void populate_chunk_hash_maps(cudf::detail::device_2dspan<EncColumnChunk> chunks,
+                              size_type num_rows,
+                              rmm::cuda_stream_view stream);
+
+/**
+ * @brief Compact dictionary hash map entries into chunk.dict_data
+ *
+ * @param chunks Flat span of chunks to compact hash maps for
+ * @param stream CUDA stream to use
+ */
+void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream);
+
+/**
+ * @brief Get the Dictionary Indices for each row
+ *
+ * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise
+ * stored in input column [row]. Stores these indices into chunk.dict_index.
+ *
+ * Since dict_data itself contains indices into the original cudf column, this means that
+ * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
+ *
+ * @param chunks Column chunks [rowgroup][column]
+ * @param num_rows Number of rows per column
+ * @param stream CUDA stream to use
+ */
+void get_dictionary_indices(cudf::detail::device_2dspan<EncColumnChunk> chunks,
+                            size_type num_rows,
+                            rmm::cuda_stream_view stream);
+
 /**
  * @brief Launches kernel for initializing encoder data pages
  *
@@ -538,17 +596,6 @@ void GatherPages(device_span<EncColumnChunk> chunks,
                  device_span<gpu::EncPage const> pages,
                  rmm::cuda_stream_view stream);
 
-/**
- * @brief Launches kernel for building chunk dictionaries
- *
- * @param[in] chunks Column chunks
- * @param[in] dev_scratch Device scratch data (kDictScratchSize bytes per dictionary)
- * @param[in] stream CUDA stream to use, default 0
- */
-void BuildChunkDictionaries(device_span<EncColumnChunk> chunks,
-                            uint32_t* dev_scratch,
-                            rmm::cuda_stream_view stream);
-
 }  // namespace gpu
 }  // namespace parquet
 }  // namespace io
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 6893a6c7eec..0d4ce40354f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -559,7 +559,7 @@ struct parquet_column_view {
                       rmm::cuda_stream_view stream);
 
   column_view leaf_column_view() const;
-  gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream);
+  gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const;
 
   column_view cudf_column_view() const { return cudf_col; }
   parquet::Type physical_type() const { return schema_node.type; }
@@ -571,26 +571,6 @@ struct parquet_column_view {
   uint8_t max_rep_level() const noexcept { return _max_rep_level; }
   bool is_list() const noexcept { return _is_list; }
 
-  // Dictionary related member functions
-  uint32_t* get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; }
-  uint32_t* get_dict_index() { return (_dict_index.size()) ? _dict_index.data() : nullptr; }
-  void use_dictionary(bool use_dict) { _dictionary_used = use_dict; }
-  void alloc_dictionary(size_t max_num_rows, rmm::cuda_stream_view stream)
-  {
-    _dict_data.resize(max_num_rows, stream);
-    _dict_index.resize(max_num_rows, stream);
-  }
-  bool check_dictionary_used(rmm::cuda_stream_view stream)
-  {
-    if (!_dictionary_used) {
-      _dict_data.resize(0, stream);
-      _dict_data.shrink_to_fit(stream);
-      _dict_index.resize(0, stream);
-      _dict_index.shrink_to_fit(stream);
-    }
-    return _dictionary_used;
-  }
-
  private:
   // Schema related members
   schema_tree_node schema_node;
@@ -610,11 +590,6 @@ struct parquet_column_view {
   rmm::device_uvector<uint8_t> _def_level;
   std::vector<uint8_t> _nullability;
   size_type _data_count = 0;
-
-  // Dictionary related members
-  bool _dictionary_used = false;
-  rmm::device_uvector<uint32_t> _dict_data;
-  rmm::device_uvector<uint32_t> _dict_index;
 };
 
 parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
@@ -624,9 +599,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
     _d_nullability(0, stream),
     _dremel_offsets(0, stream),
     _rep_level(0, stream),
-    _def_level(0, stream),
-    _dict_data(0, stream),
-    _dict_index(0, stream)
+    _def_level(0, stream)
 {
   // Construct single inheritance column_view from linked_column_view
   auto curr_col                           = schema_node.leaf_column.get();
@@ -737,21 +710,14 @@ column_view parquet_column_view::leaf_column_view() const
   return col;
 }
 
-gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view stream)
+gpu::parquet_column_device_view parquet_column_view::get_device_view(
+  rmm::cuda_stream_view stream) const
 {
   column_view col  = leaf_column_view();
   auto desc        = gpu::parquet_column_device_view{};  // Zero out all fields
   desc.stats_dtype = schema_node.stats_dtype;
   desc.ts_scale    = schema_node.ts_scale;
 
-  // TODO (dm): Enable dictionary for list and struct after refactor
-  if (physical_type() != BOOLEAN && physical_type() != UNDEFINED_TYPE &&
-      !is_nested(cudf_col.type())) {
-    alloc_dictionary(_data_count, stream);
-    desc.dict_index = get_dict_index();
-    desc.dict_data  = get_dict_data();
-  }
-
   if (is_list()) {
     desc.level_offsets = _dremel_offsets.data();
     desc.rep_values    = _rep_level.data();
@@ -759,15 +725,9 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s
   }
   desc.num_rows      = cudf_col.size();
   desc.physical_type = static_cast<uint8_t>(physical_type());
-  auto count_bits    = [](uint16_t number) {
-    int16_t nbits = 0;
-    while (number > 0) {
-      nbits++;
-      number >>= 1;
-    }
-    return nbits;
-  };
-  desc.level_bits  = count_bits(max_rep_level()) << 4 | count_bits(max_def_level());
+
+  desc.level_bits = CompactProtocolReader::NumRequiredBits(max_rep_level()) << 4 |
+                    CompactProtocolReader::NumRequiredBits(max_def_level());
   desc.nullability = _d_nullability.data();
   return desc;
 }
@@ -798,22 +758,99 @@ void writer::impl::gather_fragment_statistics(
   stream.synchronize();
 }
 
-void writer::impl::build_chunk_dictionaries(
-  hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
-  device_span<gpu::parquet_column_device_view const> col_desc,
-  uint32_t num_columns,
-  uint32_t num_dictionaries)
+void writer::impl::init_page_sizes(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
+                                   device_span<gpu::parquet_column_device_view const> col_desc,
+                                   uint32_t num_columns)
 {
   chunks.host_to_device(stream);
-  if (num_dictionaries > 0) {
-    size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize;
-    auto dict_scratch        = cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
-      dict_scratch_size / sizeof(uint32_t), stream);
+  gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, nullptr, nullptr, stream);
+  chunks.device_to_host(stream, true);
+}
+
+auto build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
+                              host_span<gpu::parquet_column_device_view const> col_desc,
+                              uint32_t num_rows,
+                              rmm::cuda_stream_view stream)
+{
+  // At this point, we know all chunks and their sizes. We want to allocate dictionaries for each
+  // chunk that can have a dictionary.
+
+  auto h_chunks = chunks.host_view().flat_view();
+
+  std::vector<rmm::device_uvector<size_type>> dict_data;
+  std::vector<rmm::device_uvector<uint16_t>> dict_index;
+
+  if (h_chunks.size() == 0) { return std::make_pair(std::move(dict_data), std::move(dict_index)); }
 
-    gpu::BuildChunkDictionaries(chunks.device_view().flat_view(), dict_scratch.data(), stream);
+  // Allocate slots for each chunk
+  std::vector<rmm::device_uvector<gpu::slot_type>> hash_maps_storage;
+  hash_maps_storage.reserve(h_chunks.size());
+  for (auto& chunk : h_chunks) {
+    if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN) {
+      chunk.use_dictionary = false;
+    } else {
+      chunk.use_dictionary = true;
+      auto& inserted_map   = hash_maps_storage.emplace_back(chunk.num_values, stream);
+      chunk.dict_map_slots = inserted_map.data();
+      chunk.dict_map_size  = inserted_map.size();
+    }
   }
-  gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, nullptr, nullptr, stream);
+
+  chunks.host_to_device(stream);
+
+  gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream);
+  gpu::populate_chunk_hash_maps(chunks, num_rows, stream);
+
   chunks.device_to_host(stream, true);
+
+  // Make decision about which chunks have dictionary
+  for (auto& ck : h_chunks) {
+    if (not ck.use_dictionary) { continue; }
+    std::tie(ck.use_dictionary, ck.dict_rle_bits) = [&]() {
+      // calculate size of chunk if dictionary is used
+
+      // If we have N unique values then the idx for the last value is N - 1 and nbits is the number
+      // of bits required to encode indices into the dictionary
+      auto max_dict_index = (ck.num_dict_entries > 0) ? ck.num_dict_entries - 1 : 0;
+      auto nbits          = CompactProtocolReader::NumRequiredBits(max_dict_index);
+
+      // We don't use dictionary if the indices are > 16 bits because that's the maximum bitpacking
+      // bitsize we efficiently support
+      if (nbits > 16) { return std::make_pair(false, 0); }
+
+      // Only these bit sizes are allowed for RLE encoding because it's compute optimized
+      constexpr auto allowed_bitsizes = std::array<size_type, 6>{1, 2, 4, 8, 12, 16};
+
+      // ceil to (1/2/4/8/12/16)
+      auto rle_bits = *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), nbits);
+      auto rle_byte_size = util::div_rounding_up_safe(ck.num_values * rle_bits, 8);
+
+      auto dict_enc_size = ck.uniq_data_size + rle_byte_size;
+
+      bool use_dict = (ck.plain_data_size > dict_enc_size);
+      if (not use_dict) { rle_bits = 0; }
+      return std::make_pair(use_dict, rle_bits);
+    }();
+  }
+
+  // TODO: (enh) Deallocate hash map storage for chunks that don't use dict and clear pointers.
+
+  dict_data.reserve(h_chunks.size());
+  dict_index.reserve(h_chunks.size());
+  for (auto& chunk : h_chunks) {
+    if (not chunk.use_dictionary) { continue; }
+
+    size_t dict_data_size     = std::min(MAX_DICT_SIZE, chunk.dict_map_size);
+    auto& inserted_dict_data  = dict_data.emplace_back(dict_data_size, stream);
+    auto& inserted_dict_index = dict_index.emplace_back(chunk.num_values, stream);
+    chunk.dict_data           = inserted_dict_data.data();
+    chunk.dict_index          = inserted_dict_index.data();
+  }
+  chunks.host_to_device(stream);
+  gpu::collect_map_entries(chunks.device_view().flat_view(), stream);
+  gpu::get_dictionary_indices(chunks.device_view(), num_rows, stream);
+
+  return std::make_pair(std::move(dict_data), std::move(dict_index));
 }
 
 void writer::impl::init_encoder_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
@@ -1013,10 +1050,8 @@ void writer::impl::write(table_view const& table)
 
   // Initialize column description
   hostdevice_vector<gpu::parquet_column_device_view> col_desc(parquet_columns.size(), stream);
-  // This should've been `auto const&` but isn't since dictionary space is allocated when calling
-  // get_device_view(). Fix during dictionary refactor.
   std::transform(
-    parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto& pcol) {
+    parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) {
       return pcol.get_device_view(stream);
     });
 
@@ -1027,11 +1062,9 @@ void writer::impl::write(table_view const& table)
   // ideally want the page size to be below 1MB so as to have enough pages to get good
   // compression/decompression performance).
   using cudf::io::parquet::gpu::max_page_fragment_size;
-  constexpr uint32_t fragment_size = 5000;
-  static_assert(fragment_size <= max_page_fragment_size,
-                "fragment size cannot be greater than max_page_fragment_size");
 
-  uint32_t num_fragments = (uint32_t)((num_rows + fragment_size - 1) / fragment_size);
+  uint32_t num_fragments =
+    (uint32_t)((num_rows + max_page_fragment_size - 1) / max_page_fragment_size);
   cudf::detail::hostdevice_2dvector<gpu::PageFragment> fragments(
     num_columns, num_fragments, stream);
 
@@ -1041,7 +1074,7 @@ void writer::impl::write(table_view const& table)
     leaf_column_views = create_leaf_column_device_views<gpu::parquet_column_device_view>(
       col_desc, *parent_column_table_device_view, stream);
 
-    init_page_fragments(fragments, col_desc, num_rows, fragment_size);
+    init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size);
   }
 
   size_t global_rowgroup_base = md.row_groups.size();
@@ -1056,11 +1089,12 @@ void writer::impl::write(table_view const& table)
     for (auto i = 0; i < num_columns; i++) {
       fragment_data_size += fragments[i][f].fragment_data_size;
     }
-    if (f > rowgroup_start && (rowgroup_size + fragment_data_size > max_rowgroup_size_ ||
-                               (f + 1 - rowgroup_start) * fragment_size > max_rowgroup_rows_)) {
+    if (f > rowgroup_start &&
+        (rowgroup_size + fragment_data_size > max_rowgroup_size_ ||
+         (f + 1 - rowgroup_start) * max_page_fragment_size > max_rowgroup_rows_)) {
       // update schema
       md.row_groups.resize(md.row_groups.size() + 1);
-      md.row_groups[global_r++].num_rows = (f - rowgroup_start) * fragment_size;
+      md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size;
       num_rowgroups++;
       rowgroup_start = f;
       rowgroup_size  = 0;
@@ -1069,7 +1103,7 @@ void writer::impl::write(table_view const& table)
     if (f + 1 == num_fragments) {
       // update schema
       md.row_groups.resize(md.row_groups.size() + 1);
-      md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * fragment_size;
+      md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * max_page_fragment_size;
       num_rowgroups++;
     }
   }
@@ -1087,20 +1121,19 @@ void writer::impl::write(table_view const& table)
   // Initialize row groups and column chunks
   uint32_t num_chunks = num_rowgroups * num_columns;
   hostdevice_2dvector<gpu::EncColumnChunk> chunks(num_rowgroups, num_columns, stream);
-  uint32_t num_dictionaries = 0;
   for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups;
        r++, global_r++) {
-    uint32_t fragments_in_chunk =
-      (uint32_t)((md.row_groups[global_r].num_rows + fragment_size - 1) / fragment_size);
+    uint32_t fragments_in_chunk = (uint32_t)(
+      (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size);
     md.row_groups[global_r].total_byte_size = 0;
     md.row_groups[global_r].columns.resize(num_columns);
     for (int i = 0; i < num_columns; i++) {
       gpu::EncColumnChunk* ck = &chunks[r][i];
-      bool dict_enable        = false;
 
-      *ck           = {};
-      ck->col_desc  = col_desc.device_ptr() + i;
-      ck->fragments = &fragments.device_view()[i][f];
+      *ck             = {};
+      ck->col_desc    = col_desc.device_ptr() + i;
+      ck->col_desc_id = i;
+      ck->fragments   = &fragments.device_view()[i][f];
       ck->stats = (frag_stats.size() != 0) ? frag_stats.data() + i * num_fragments + f : nullptr;
       ck->start_row        = start_row;
       ck->num_rows         = (uint32_t)md.row_groups[global_r].num_rows;
@@ -1110,30 +1143,12 @@ void writer::impl::write(table_view const& table)
         std::accumulate(chunk_fragments.begin(), chunk_fragments.end(), 0, [](uint32_t l, auto r) {
           return l + r.num_values;
         });
-      ck->dictionary_id = num_dictionaries;
-      if (col_desc[i].dict_data) {
-        size_t plain_size      = 0;
-        size_t dict_size       = 1;
-        uint32_t num_dict_vals = 0;
-        for (uint32_t j = 0; j < fragments_in_chunk && num_dict_vals < 65536; j++) {
-          plain_size += chunk_fragments[j].fragment_data_size;
-          dict_size += chunk_fragments[j].dict_data_size +
-                       ((num_dict_vals > 256) ? 2 : 1) * chunk_fragments[j].non_nulls;
-          num_dict_vals += chunk_fragments[j].num_dict_vals;
-        }
-        if (dict_size < plain_size) {
-          parquet_columns[i].use_dictionary(true);
-          dict_enable = true;
-          num_dictionaries++;
-        }
-      }
-      ck->has_dictionary                                     = dict_enable;
+      ck->plain_data_size = std::accumulate(
+        chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) {
+          return sum + frag.fragment_data_size;
+        });
       md.row_groups[global_r].columns[i].meta_data.type      = parquet_columns[i].physical_type();
       md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, Encoding::RLE};
-      if (dict_enable) {
-        md.row_groups[global_r].columns[i].meta_data.encodings.push_back(
-          Encoding::PLAIN_DICTIONARY);
-      }
       md.row_groups[global_r].columns[i].meta_data.path_in_schema =
         parquet_columns[i].get_path_in_schema();
       md.row_groups[global_r].columns[i].meta_data.codec      = UNCOMPRESSED;
@@ -1143,15 +1158,18 @@ void writer::impl::write(table_view const& table)
     start_row += (uint32_t)md.row_groups[global_r].num_rows;
   }
 
-  // Free unused dictionaries
-  for (auto& col : parquet_columns) {
-    col.check_dictionary_used(stream);
+  auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream);
+  for (uint32_t rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) {
+    for (int col = 0; col < num_columns; col++) {
+      if (chunks.host_view()[rg][col].use_dictionary) {
+        md.row_groups[global_rg].columns[col].meta_data.encodings.push_back(
+          Encoding::PLAIN_DICTIONARY);
+      }
+    }
   }
 
   // Build chunk dictionaries and count pages
-  if (num_chunks != 0) {
-    build_chunk_dictionaries(chunks, col_desc, num_columns, num_dictionaries);
-  }
+  if (num_chunks != 0) { init_page_sizes(chunks, col_desc, num_columns); }
 
   // Initialize batches of rowgroups to encode (mainly to limit peak memory usage)
   std::vector<uint32_t> batch_list;
@@ -1301,9 +1319,9 @@ void writer::impl::write(table_view const& table)
         }
         md.row_groups[global_r].total_byte_size += ck->compressed_size;
         md.row_groups[global_r].columns[i].meta_data.data_page_offset =
-          current_chunk_offset + ((ck->has_dictionary) ? ck->dictionary_size : 0);
+          current_chunk_offset + ((ck->use_dictionary) ? ck->dictionary_size : 0);
         md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset =
-          (ck->has_dictionary) ? current_chunk_offset : 0;
+          (ck->use_dictionary) ? current_chunk_offset : 0;
         md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size;
         md.row_groups[global_r].columns[i].meta_data.total_compressed_size   = ck->compressed_size;
         current_chunk_offset += ck->compressed_size;
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index 8d9bdc8adbd..8fb1a8294fb 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -153,12 +153,11 @@ class writer::impl {
    * @param chunks column chunk array
    * @param col_desc column description array
    * @param num_columns Total number of columns
-   * @param num_dictionaries Total number of dictionaries
    */
-  void build_chunk_dictionaries(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
-                                device_span<gpu::parquet_column_device_view const> col_desc,
-                                uint32_t num_columns,
-                                uint32_t num_dictionaries);
+  void init_page_sizes(hostdevice_2dvector<gpu::EncColumnChunk>& chunks,
+                       device_span<gpu::parquet_column_device_view const> col_desc,
+                       uint32_t num_columns);
+
   /**
    * @brief Initialize encoder pages
    *
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 8fdfc6f9165..70b4bd1d873 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -275,10 +275,10 @@ inline auto random_values(size_t size)
 TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumn)
 {
   auto sequence =
-    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i); });
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i % 400); });
   auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
 
-  constexpr auto num_rows = 100;
+  constexpr auto num_rows = 800;
   column_wrapper<TypeParam> col(sequence, sequence + num_rows, validity);
 
   std::vector<std::unique_ptr<column>> cols;

From 80bf8d9f2fa2719d20e6d7d4c8d12ea3448324cf Mon Sep 17 00:00:00 2001
From: Devavret Makkar <devavret@users.noreply.github.com>
Date: Thu, 19 Aug 2021 14:42:28 +0530
Subject: [PATCH 11/46] Add nested column selection to parquet reader (#8933)

Closes #8850

Adds the ability to select specific children of a nested column. The Python API mimics pyarrow, and the format is:
```python
cudf.read_parquet("test.parquet", columns=["struct1.child1.grandchild2", "struct1.child2"])
```
The C++ API takes each path as a vector
```c++
cudf::io::parquet_reader_options read_args =
  cudf::io::parquet_reader_options::builder(cudf::io::source_info(filepath))
    .columns({{"struct1", "child1", "grandchild2"},
              {"struct1", "child2"}});
```

Authors:
  - Devavret Makkar (https://github.com/devavret)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Christopher Harris (https://github.com/cwharris)

URL: https://github.com/rapidsai/cudf/pull/8933
---
 cpp/include/cudf/io/parquet.hpp        |   2 +-
 cpp/src/io/parquet/parquet.cpp         |   1 +
 cpp/src/io/parquet/parquet.hpp         |   1 +
 cpp/src/io/parquet/reader_impl.cu      | 310 ++++++++++++++++---------
 cpp/tests/io/parquet_test.cpp          | 131 ++++++++++-
 python/cudf/cudf/io/parquet.py         |   4 +
 python/cudf/cudf/tests/test_parquet.py | 110 +++++++--
 7 files changed, 420 insertions(+), 139 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index ecd9607a87e..031228ae6de 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -50,7 +50,7 @@ class parquet_reader_options_builder;
 class parquet_reader_options {
   source_info _source;
 
-  // Names of column to read; empty is all
+  // Paths in schema of the columns to read; empty means all
   std::vector<std::string> _columns;
 
   // List of individual row groups to read (ignored if empty)
diff --git a/cpp/src/io/parquet/parquet.cpp b/cpp/src/io/parquet/parquet.cpp
index 6c658788fa1..c8c54e9933f 100644
--- a/cpp/src/io/parquet/parquet.cpp
+++ b/cpp/src/io/parquet/parquet.cpp
@@ -347,6 +347,7 @@ int CompactProtocolReader::WalkSchema(
     ++idx;
     if (e->num_children > 0) {
       for (int i = 0; i < e->num_children; i++) {
+        e->children_idx.push_back(idx);
         int idx_old = idx;
         idx         = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level);
         if (idx <= idx_old) break;  // Error
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 2232017409d..4390d1c788f 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -165,6 +165,7 @@ struct SchemaElement {
   int max_definition_level = 0;
   int max_repetition_level = 0;
   int parent_idx           = 0;
+  std::vector<size_t> children_idx;
 
   bool operator==(SchemaElement const& other) const
   {
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 9f9bdfd4755..caf11b66206 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -464,8 +464,9 @@ class aggregate_metadata {
    *
    * @param names List of column names to load, where index column name(s) will be added
    */
-  void add_pandas_index_names(std::vector<std::string>& names) const
+  std::vector<std::string> get_pandas_index_names() const
   {
+    std::vector<std::string> names;
     auto str = get_pandas_index();
     if (str.length() != 0) {
       std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"};
@@ -480,6 +481,7 @@ class aggregate_metadata {
         str = sm.suffix();
       }
     }
+    return names;
   }
 
   struct row_group_info {
@@ -549,86 +551,14 @@ class aggregate_metadata {
     return selection;
   }
 
-  /**
-   * @brief Build input and output column structures based on schema input. Recursive.
-   *
-   * @param[in,out] schema_idx Schema index to build information for. This value gets
-   * incremented as the function recurses.
-   * @param[out] input_columns Input column information (source data in the file)
-   * @param[out] output_columns Output column structure (resulting cudf columns)
-   * @param[in,out] nesting A stack keeping track of child column indices so we can
-   * reproduce the linear list of output columns that correspond to an input column.
-   * @param[in] strings_to_categorical Type conversion parameter
-   * @param[in] timestamp_type_id Type conversion parameter
-   * @param[in] strict_decimal_types True if it is an error to load an unsupported decimal type
-   *
-   */
-  void build_column_info(int& schema_idx,
-                         std::vector<input_column_info>& input_columns,
-                         std::vector<column_buffer>& output_columns,
-                         std::deque<int>& nesting,
-                         bool strings_to_categorical,
-                         type_id timestamp_type_id,
-                         bool strict_decimal_types) const
-  {
-    int start_schema_idx = schema_idx;
-    auto const& schema   = get_schema(schema_idx);
-    schema_idx++;
-
-    // if I am a stub, continue on
-    if (schema.is_stub()) {
-      // is this legit?
-      CUDF_EXPECTS(schema.num_children == 1, "Unexpected number of children for stub");
-      build_column_info(schema_idx,
-                        input_columns,
-                        output_columns,
-                        nesting,
-                        strings_to_categorical,
-                        timestamp_type_id,
-                        strict_decimal_types);
-      return;
-    }
-
-    // if we're at the root, this is a new output column
-    nesting.push_back(static_cast<int>(output_columns.size()));
-    auto const col_type =
-      to_type_id(schema, strings_to_categorical, timestamp_type_id, strict_decimal_types);
-    auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64
-                         ? data_type{col_type, numeric::scale_type{-schema.decimal_scale}}
-                         : data_type{col_type};
-    output_columns.emplace_back(dtype, schema.repetition_type == OPTIONAL ? true : false);
-    column_buffer& output_col = output_columns.back();
-    output_col.name           = schema.name;
-
-    // build each child
-    for (int idx = 0; idx < schema.num_children; idx++) {
-      build_column_info(schema_idx,
-                        input_columns,
-                        output_col.children,
-                        nesting,
-                        strings_to_categorical,
-                        timestamp_type_id,
-                        strict_decimal_types);
-    }
-
-    // if I have no children, we're at a leaf and I'm an input column (that is, one with actual
-    // data stored) so add me to the list.
-    if (schema.num_children == 0) {
-      input_columns.emplace_back(input_column_info{start_schema_idx, schema.name});
-      input_column_info& input_col = input_columns.back();
-      std::copy(nesting.begin(), nesting.end(), std::back_inserter(input_col.nesting));
-    }
-
-    nesting.pop_back();
-  }
-
   /**
    * @brief Filters and reduces down to a selection of columns
    *
-   * @param use_names List of column names to select
+   * @param use_names List of paths of column names to select
    * @param include_index Whether to always include the PANDAS index column(s)
    * @param strings_to_categorical Type conversion parameter
    * @param timestamp_type_id Type conversion parameter
+   * @param strict_decimal_types Type conversion parameter
    *
    * @return input column information, output column information, list of output column schema
    * indices
@@ -639,9 +569,86 @@ class aggregate_metadata {
                       type_id timestamp_type_id,
                       bool strict_decimal_types) const
   {
-    auto const& pfm = per_file_metadata[0];
+    auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) {
+      auto const& col_schema_idx = std::find_if(
+        schema_elem.children_idx.cbegin(),
+        schema_elem.children_idx.cend(),
+        [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; });
+
+      return (col_schema_idx != schema_elem.children_idx.end()) ? static_cast<int>(*col_schema_idx)
+                                                                : -1;
+    };
+
+    std::vector<column_buffer> output_columns;
+    std::vector<input_column_info> input_columns;
+    std::vector<int> nesting;
+
+    // Return true if column path is valid. E.g. if the path is {"struct1", "child1"}, then it is
+    // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is
+    // not a child of "struct1", then the function will return false for "struct1".
+    std::function<bool(column_name_info const*, int, std::vector<column_buffer>&)> build_column =
+      [&](column_name_info const* col_name_info,
+          int schema_idx,
+          std::vector<column_buffer>& out_col_array) {
+        if (schema_idx < 0) { return false; }
+        auto const& schema_elem = get_schema(schema_idx);
+
+        // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer
+        // hierarchy. So continue on
+        if (schema_elem.is_stub()) {
+          // is this legit?
+          CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub");
+          auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr;
+          return build_column(child_col_name_info, schema_elem.children_idx[0], out_col_array);
+        }
+
+        // if we're at the root, this is a new output column
+        auto const col_type =
+          to_type_id(schema_elem, strings_to_categorical, timestamp_type_id, strict_decimal_types);
+        auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64
+                             ? data_type{col_type, numeric::scale_type{-schema_elem.decimal_scale}}
+                             : data_type{col_type};
+
+        column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL);
+        // store the index of this element if inserted in out_col_array
+        nesting.push_back(static_cast<int>(out_col_array.size()));
+        output_col.name = schema_elem.name;
+
+        // build each child
+        bool path_is_valid = false;
+        if (col_name_info == nullptr or col_name_info->children.empty()) {
+          // add all children of schema_elem.
+          // At this point, we can no longer pass a col_name_info to build_column
+          for (int idx = 0; idx < schema_elem.num_children; idx++) {
+            path_is_valid |=
+              build_column(nullptr, schema_elem.children_idx[idx], output_col.children);
+          }
+        } else {
+          for (size_t idx = 0; idx < col_name_info->children.size(); idx++) {
+            path_is_valid |=
+              build_column(&col_name_info->children[idx],
+                           find_schema_child(schema_elem, col_name_info->children[idx].name),
+                           output_col.children);
+          }
+        }
+
+        // if I have no children, we're at a leaf and I'm an input column (that is, one with actual
+        // data stored) so add me to the list.
+        if (schema_elem.num_children == 0) {
+          input_column_info& input_col =
+            input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name});
+          std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting));
+          path_is_valid = true;  // If we're able to reach leaf then path is valid
+        }
+
+        if (path_is_valid) { out_col_array.push_back(std::move(output_col)); }
+
+        nesting.pop_back();
+        return path_is_valid;
+      };
+
+    std::vector<int> output_column_schemas;
 
-    // determine the list of output columns
     //
     // there is not necessarily a 1:1 mapping between input columns and output columns.
     // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns.
@@ -657,43 +664,120 @@ class aggregate_metadata {
     // "firstname", "middlename" and "lastname" represent the input columns in the file that we
     // process to produce the final cudf "name" column.
     //
-    std::vector<int> output_column_schemas;
+    // A user can ask for a single field out of the struct e.g. firstname.
+    // In this case they'll pass the fully qualified path to the schema element as a single
+    // "."-separated string, e.g. "name.firstname"
+    //
+    auto const& root = get_schema(0);
     if (use_names.empty()) {
-      // walk the schema and choose all top level columns
-      for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) {
-        auto const& schema = pfm.schema[schema_idx];
-        if (schema.parent_idx == 0) { output_column_schemas.push_back(schema_idx); }
+      for (auto const& schema_idx : root.children_idx) {
+        build_column(nullptr, schema_idx, output_columns);
+        output_column_schemas.push_back(schema_idx);
       }
     } else {
-      // Load subset of columns; include PANDAS index unless excluded
-      std::vector<std::string> local_use_names = use_names;
-      if (include_index) { add_pandas_index_names(local_use_names); }
-      for (const auto& use_name : local_use_names) {
-        for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) {
-          auto const& schema = pfm.schema[schema_idx];
-          // We select only top level columns by name. Selecting nested columns by name is not
-          // supported. Top level columns are identified by their parent being the root (idx == 0)
-          if (use_name == schema.name and schema.parent_idx == 0) {
-            output_column_schemas.push_back(schema_idx);
-          }
+      struct path_info {
+        std::string full_path;
+        int schema_idx;
+      };
+
+      // Convert schema into a vector of every possible path
+      std::vector<path_info> all_paths;
+      std::function<void(std::string, int)> add_path = [&](std::string path_till_now,
+                                                           int schema_idx) {
+        auto const& schema_elem = get_schema(schema_idx);
+        std::string curr_path   = path_till_now + schema_elem.name;
+        all_paths.push_back({curr_path, schema_idx});
+        for (auto const& child_idx : schema_elem.children_idx) {
+          add_path(curr_path + ".", child_idx);
         }
+      };
+      for (auto const& child_idx : get_schema(0).children_idx) {
+        add_path("", child_idx);
       }
-    }
 
-    // construct input and output output column info
-    std::vector<column_buffer> output_columns;
-    output_columns.reserve(output_column_schemas.size());
-    std::vector<input_column_info> input_columns;
-    std::deque<int> nesting;
-    for (size_t idx = 0; idx < output_column_schemas.size(); idx++) {
-      int schema_index = output_column_schemas[idx];
-      build_column_info(schema_index,
-                        input_columns,
-                        output_columns,
-                        nesting,
-                        strings_to_categorical,
-                        timestamp_type_id,
-                        strict_decimal_types);
+      // Find which of the selected paths are valid and get their schema index
+      std::vector<path_info> valid_selected_paths;
+      for (auto const& selected_path : use_names) {
+        auto found_path =
+          std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) {
+            return valid_path.full_path == selected_path;
+          });
+        if (found_path != all_paths.end()) {
+          valid_selected_paths.push_back({selected_path, found_path->schema_idx});
+        }
+      }
+
+      // Now construct paths as vector of strings for further consumption
+      std::vector<std::vector<std::string>> use_names3;
+      std::transform(valid_selected_paths.begin(),
+                     valid_selected_paths.end(),
+                     std::back_inserter(use_names3),
+                     [&](path_info const& valid_path) {
+                       auto schema_idx = valid_path.schema_idx;
+                       std::vector<std::string> result_path;
+                       do {
+                         SchemaElement const& elem = get_schema(schema_idx);
+                         result_path.push_back(elem.name);
+                         schema_idx = elem.parent_idx;
+                       } while (schema_idx > 0);
+                       return std::vector<std::string>(result_path.rbegin(), result_path.rend());
+                     });
+
+      std::vector<column_name_info> selected_columns;
+      if (include_index) {
+        std::vector<std::string> index_names = get_pandas_index_names();
+        std::transform(index_names.cbegin(),
+                       index_names.cend(),
+                       std::back_inserter(selected_columns),
+                       [](std::string const& name) { return column_name_info(name); });
+      }
+      // Merge the paths in use_names3 into a set of hierarchical column_name_info objects
+      /* This is because if we have columns like this:
+       *     col1
+       *      / \
+       *    s3   f4
+       *   / \
+       * f5   f6
+       *
+       * there may be common paths in use_names3 like:
+       * {"col1", "s3", "f5"}, {"col1", "f4"}
+       * which means we want the output to contain
+       *     col1
+       *      / \
+       *    s3   f4
+       *   /
+       * f5
+       *
+       * rather than
+       *  col1   col1
+       *   |      |
+       *   s3     f4
+       *   |
+       *   f5
+       */
+      for (auto const& path : use_names3) {
+        auto array_to_find_in = &selected_columns;
+        for (size_t depth = 0; depth < path.size(); ++depth) {
+          // Check if the path exists in our selected_columns and if not, add it.
+          auto const& name_to_find = path[depth];
+          auto found_col           = std::find_if(
+            array_to_find_in->begin(),
+            array_to_find_in->end(),
+            [&name_to_find](column_name_info const& col) { return col.name == name_to_find; });
+          if (found_col == array_to_find_in->end()) {
+            auto& col        = array_to_find_in->emplace_back(name_to_find);
+            array_to_find_in = &col.children;
+          } else {
+            // Path exists. go down further.
+            array_to_find_in = &found_col->children;
+          }
+        }
+      }
+      for (auto& col : selected_columns) {
+        auto const& top_level_col_schema_idx = find_schema_child(root, col.name);
+        bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns);
+        if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx);
+      }
     }
 
     return std::make_tuple(
@@ -1581,18 +1665,16 @@ table_with_metadata reader::impl::read(size_type skip_rows,
 
       // create the final output cudf columns
       for (size_t i = 0; i < _output_columns.size(); ++i) {
-        out_metadata.schema_info.push_back(column_name_info{""});
-        out_columns.emplace_back(
-          make_column(_output_columns[i], &out_metadata.schema_info.back(), stream, _mr));
+        column_name_info& col_name = out_metadata.schema_info.emplace_back("");
+        out_columns.emplace_back(make_column(_output_columns[i], &col_name, stream, _mr));
       }
     }
   }
 
   // Create empty columns as needed (this can happen if we've ended up with no actual data to read)
   for (size_t i = out_columns.size(); i < _output_columns.size(); ++i) {
-    out_metadata.schema_info.push_back(column_name_info{""});
-    out_columns.emplace_back(cudf::io::detail::empty_like(
-      _output_columns[i], &out_metadata.schema_info.back(), stream, _mr));
+    column_name_info& col_name = out_metadata.schema_info.emplace_back("");
+    out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, stream, _mr));
   }
 
   // Return column names (must match order of returned columns)
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 70b4bd1d873..7260aa9e686 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -816,7 +816,7 @@ TEST_F(ParquetWriterTest, MultiIndex)
   expected_metadata.column_metadata[3].set_name("floats");
   expected_metadata.column_metadata[4].set_name("doubles");
   expected_metadata.user_data.insert(
-    {"pandas", "\"index_columns\": [\"floats\", \"doubles\"], \"column1\": [\"int8s\"]"});
+    {"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"});
 
   auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet");
   cudf_io::parquet_writer_options out_opts =
@@ -827,7 +827,7 @@ TEST_F(ParquetWriterTest, MultiIndex)
   cudf_io::parquet_reader_options in_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath})
       .use_pandas_metadata(true)
-      .columns({"int8s", "int16s", "int32s"});
+      .columns({"int32s", "floats", "doubles"});
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
@@ -967,8 +967,6 @@ TEST_F(ParquetWriterTest, StructOfList)
   auto struct_2 =
     cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
 
-  // cudf::test::print(struct_2->child(1).child(2));
-
   auto expected = table_view({*struct_2});
 
   cudf_io::table_input_metadata expected_metadata(expected);
@@ -2497,6 +2495,131 @@ TEST_F(ParquetReaderTest, ReorderedColumns)
   }
 }
 
+TEST_F(ParquetReaderTest, SelectNestedColumn)
+{
+  // Schema written by this test:
+  //
+  // being: Struct<human?: bool,
+  //               particulars: Struct<weight: float,
+  //                                   age: int32
+  //                                  >
+  //              >
+
+  auto weights_col = cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
+
+  auto ages_col =
+    cudf::test::fixed_width_column_wrapper<int32_t>{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+
+  auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col}, {1, 1, 1, 1, 0, 1}};
+
+  auto is_human_col = cudf::test::fixed_width_column_wrapper<bool>{
+    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+
+  auto struct_2 =
+    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+
+  auto input = table_view({*struct_2});
+
+  cudf_io::table_input_metadata input_metadata(input);
+  input_metadata.column_metadata[0].set_name("being");
+  input_metadata.column_metadata[0].child(0).set_name("human?");
+  input_metadata.column_metadata[0].child(1).set_name("particulars");
+  input_metadata.column_metadata[0].child(1).child(0).set_name("weight");
+  input_metadata.column_metadata[0].child(1).child(1).set_name("age");
+
+  auto filepath = temp_env->get_temp_filepath("SelectNestedColumn.parquet");
+  cudf_io::parquet_writer_options args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, input)
+      .metadata(&input_metadata);
+  cudf_io::write_parquet(args);
+
+  {  // Test selecting a single leaf from the table
+    cudf_io::parquet_reader_options read_args =
+      cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath))
+        .columns({"being.particulars.age"});
+    const auto result = cudf_io::read_parquet(read_args);
+
+    auto expect_ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+      {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+    auto expect_s_1 = cudf::test::structs_column_wrapper{{expect_ages_col}, {1, 1, 1, 1, 0, 1}};
+    auto expect_s_2 =
+      cudf::test::structs_column_wrapper{{expect_s_1}, {0, 1, 1, 1, 1, 1}}.release();
+    auto expected = table_view({*expect_s_2});
+
+    cudf_io::table_input_metadata expected_metadata(expected);
+    expected_metadata.column_metadata[0].set_name("being");
+    expected_metadata.column_metadata[0].child(0).set_name("particulars");
+    expected_metadata.column_metadata[0].child(0).child(0).set_name("age");
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+    compare_metadata_equality(expected_metadata, result.metadata);
+  }
+
+  {  // Test selecting a non-leaf and expecting all hierarchy from that node onwards
+    cudf_io::parquet_reader_options read_args =
+      cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath))
+        .columns({"being.particulars"});
+    const auto result = cudf_io::read_parquet(read_args);
+
+    auto expected_weights_col =
+      cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
+
+    auto expected_ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+      {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+
+    auto expected_s_1 = cudf::test::structs_column_wrapper{
+      {expected_weights_col, expected_ages_col}, {1, 1, 1, 1, 0, 1}};
+
+    auto expect_s_2 =
+      cudf::test::structs_column_wrapper{{expected_s_1}, {0, 1, 1, 1, 1, 1}}.release();
+    auto expected = table_view({*expect_s_2});
+
+    cudf_io::table_input_metadata expected_metadata(expected);
+    expected_metadata.column_metadata[0].set_name("being");
+    expected_metadata.column_metadata[0].child(0).set_name("particulars");
+    expected_metadata.column_metadata[0].child(0).child(0).set_name("weight");
+    expected_metadata.column_metadata[0].child(0).child(1).set_name("age");
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+    compare_metadata_equality(expected_metadata, result.metadata);
+  }
+
+  {  // Test selecting struct children out of order
+    cudf_io::parquet_reader_options read_args =
+      cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath))
+        .columns({"being.particulars.age", "being.particulars.weight", "being.human?"});
+    const auto result = cudf_io::read_parquet(read_args);
+
+    auto expected_weights_col =
+      cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
+
+    auto expected_ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+      {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+
+    auto expected_is_human_col = cudf::test::fixed_width_column_wrapper<bool>{
+      {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+
+    auto expect_s_1 = cudf::test::structs_column_wrapper{{expected_ages_col, expected_weights_col},
+                                                         {1, 1, 1, 1, 0, 1}};
+
+    auto expect_s_2 =
+      cudf::test::structs_column_wrapper{{expect_s_1, expected_is_human_col}, {0, 1, 1, 1, 1, 1}}
+        .release();
+
+    auto expected = table_view({*expect_s_2});
+
+    cudf_io::table_input_metadata expected_metadata(expected);
+    expected_metadata.column_metadata[0].set_name("being");
+    expected_metadata.column_metadata[0].child(0).set_name("particulars");
+    expected_metadata.column_metadata[0].child(0).child(0).set_name("age");
+    expected_metadata.column_metadata[0].child(0).child(1).set_name("weight");
+    expected_metadata.column_metadata[0].child(1).set_name("human?");
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+    compare_metadata_equality(expected_metadata, result.metadata);
+  }
+}
+
 TEST_F(ParquetReaderTest, DecimalRead)
 {
   {
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index a18486cff3c..fa748761695 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -210,6 +210,10 @@ def read_parquet(
         else:
             filepaths_or_buffers.append(tmp_source)
 
+    if columns is not None:
+        if not is_list_like(columns):
+            raise ValueError("Expected list like for columns")
+
     if filters is not None:
         # Convert filters to ds.Expression
         filters = pq._filters_to_expression(filters)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 21dc8315e32..e4a61a2a37e 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1140,6 +1140,84 @@ def test_parquet_reader_struct_basic(tmpdir, data):
     assert expect.equals(got.to_arrow())
 
 
+def select_columns_params():
+    dfs = [
+        # struct
+        (
+            [
+                {"a": 1, "b": 2},
+                {"a": 10, "b": 20},
+                {"a": None, "b": 22},
+                {"a": None, "b": None},
+                {"a": 15, "b": None},
+            ],
+            [["struct"], ["struct.a"], ["struct.b"], ["c"]],
+        ),
+        # struct-of-list
+        (
+            [
+                {"a": 1, "b": 2, "c": [1, 2, 3]},
+                {"a": 10, "b": 20, "c": [4, 5]},
+                {"a": None, "b": 22, "c": [6]},
+                {"a": None, "b": None, "c": None},
+                {"a": 15, "b": None, "c": [-1, -2]},
+                None,
+                {"a": 100, "b": 200, "c": [-10, None, -20]},
+            ],
+            [
+                ["struct"],
+                ["struct.c"],
+                ["struct.c.list"],
+                ["struct.c.list.item"],
+                ["struct.b", "struct.c"],
+                ["struct.b", "struct.d", "struct.c"],
+            ],
+        ),
+        # list-of-struct
+        (
+            [
+                [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}],
+                None,
+                [{"a": 10, "b": 20}],
+                [{"a": 100, "b": 200}, {"a": None, "b": 300}, None],
+            ],
+            [
+                ["struct"],
+                ["struct.list"],
+                ["struct.list.item"],
+                ["struct.list.item.a", "struct.list.item.b"],
+                ["struct.list.item.c"],
+            ],
+        ),
+        # struct with "." in field names
+        (
+            [
+                {"a.b": 1, "b.a": 2},
+                {"a.b": 10, "b.a": 20},
+                {"a.b": None, "b.a": 22},
+                {"a.b": None, "b.a": None},
+                {"a.b": 15, "b.a": None},
+            ],
+            [["struct"], ["struct.a"], ["struct.b.a"]],
+        ),
+    ]
+    for df_col_pair in dfs:
+        for cols in df_col_pair[1]:
+            yield df_col_pair[0], cols
+
+
+@pytest.mark.parametrize("data, columns", select_columns_params())
+def test_parquet_reader_struct_select_columns(tmpdir, data, columns):
+    table = pa.Table.from_pydict({"struct": data})
+    buff = BytesIO()
+
+    pa.parquet.write_table(table, buff)
+
+    expect = pq.ParquetFile(buff).read(columns=columns)
+    got = cudf.read_parquet(buff, columns=columns)
+    assert expect.equals(got.to_arrow())
+
+
 def test_parquet_reader_struct_los_large(tmpdir):
     num_rows = 256
     list_size = 64
@@ -1860,26 +1938,18 @@ def test_parquet_writer_list_statistics(tmpdir):
             ]
         },
         # List of Structs
-        pytest.param(
-            {
-                "family": [
-                    [
-                        None,
-                        {"human?": True, "deets": {"weight": 2.4, "age": 27}},
-                    ],
-                    [
-                        {"human?": None, "deets": {"weight": 5.3, "age": 25}},
-                        {"human?": False, "deets": {"weight": 8.0, "age": 31}},
-                        {"human?": False, "deets": None},
-                    ],
-                    [],
-                    [{"human?": None, "deets": {"weight": 6.9, "age": None}}],
-                ]
-            },
-            marks=pytest.mark.xfail(
-                reason="https://github.com/rapidsai/cudf/issues/7561"
-            ),
-        ),
+        {
+            "family": [
+                [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}],
+                [
+                    {"human?": None, "deets": {"weight": 5.3, "age": 25}},
+                    {"human?": False, "deets": {"weight": 8.0, "age": 31}},
+                    {"human?": False, "deets": None},
+                ],
+                [],
+                [{"human?": None, "deets": {"weight": 6.9, "age": None}}],
+            ]
+        },
         # Struct of Lists
         pytest.param(
             {

From 53f0bb442b8036abd565937740115365ecb4d1b3 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Thu, 19 Aug 2021 07:52:48 -0500
Subject: [PATCH 12/46] Update JNI java CSV APIs to not use deprecated API
 (#9066)

This fixes #9062

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/9066
---
 java/src/main/java/ai/rapids/cudf/DType.java  | 72 ++++++++-----------
 java/src/main/java/ai/rapids/cudf/Schema.java | 35 ++++++---
 java/src/main/java/ai/rapids/cudf/Table.java  | 10 +--
 java/src/main/native/src/TableJni.cpp         | 23 +++++-
 4 files changed, 82 insertions(+), 58 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java
index 87237f1e4b2..2d851aa2ae3 100644
--- a/java/src/main/java/ai/rapids/cudf/DType.java
+++ b/java/src/main/java/ai/rapids/cudf/DType.java
@@ -30,65 +30,61 @@ public final class DType {
   2. Update SINGLETON_DTYPE_LOOKUP to reflect new type. The order should be maintained between
   DTypeEnum and SINGLETON_DTYPE_LOOKUP */
   public enum DTypeEnum {
-    EMPTY(0, 0, "NOT SUPPORTED"),
-    INT8(1, 1, "byte"),
-    INT16(2, 2, "short"),
-    INT32(4, 3, "int"),
-    INT64(8, 4, "long"),
-    UINT8(1, 5, "uint8"),
-    UINT16(2, 6, "uint16"),
-    UINT32(4, 7, "uint32"),
-    UINT64(8, 8, "uint64"),
-    FLOAT32(4, 9, "float"),
-    FLOAT64(8, 10, "double"),
+    EMPTY(0, 0),
+    INT8(1, 1),
+    INT16(2, 2),
+    INT32(4, 3),
+    INT64(8, 4),
+    UINT8(1, 5),
+    UINT16(2, 6),
+    UINT32(4, 7),
+    UINT64(8, 8),
+    FLOAT32(4, 9),
+    FLOAT64(8, 10),
     /**
      * Byte wise true non-0/false 0.  In general true will be 1.
      */
-    BOOL8(1, 11, "bool"),
+    BOOL8(1, 11),
     /**
      * Days since the UNIX epoch
      */
-    TIMESTAMP_DAYS(4, 12, "date32"),
+    TIMESTAMP_DAYS(4, 12),
     /**
      * s since the UNIX epoch
      */
-    TIMESTAMP_SECONDS(8, 13, "timestamp[s]"),
+    TIMESTAMP_SECONDS(8, 13),
     /**
      * ms since the UNIX epoch
      */
-    TIMESTAMP_MILLISECONDS(8, 14, "timestamp[ms]"),
+    TIMESTAMP_MILLISECONDS(8, 14),
     /**
      * microseconds since the UNIX epoch
      */
-    TIMESTAMP_MICROSECONDS(8, 15, "timestamp[us]"),
+    TIMESTAMP_MICROSECONDS(8, 15),
     /**
      * ns since the UNIX epoch
      */
-    TIMESTAMP_NANOSECONDS(8, 16, "timestamp[ns]"),
-
-    //We currently don't have mappings for duration type to I/O files, and these
-    //simpleNames might change in future when we do
-    DURATION_DAYS(4, 17, "int32"),
-    DURATION_SECONDS(8, 18, "int64"),
-    DURATION_MILLISECONDS(8, 19, "int64"),
-    DURATION_MICROSECONDS(8, 20, "int64"),
-    DURATION_NANOSECONDS(8, 21, "int64"),
-    //DICTIONARY32(4, 22, "NO IDEA"),
-
-    STRING(0, 23, "str"),
-    LIST(0, 24, "list"),
-    DECIMAL32(4, 25, "decimal32"),
-    DECIMAL64(8, 26, "decimal64"),
-    STRUCT(0, 27, "struct");
+    TIMESTAMP_NANOSECONDS(8, 16),
+
+    DURATION_DAYS(4, 17),
+    DURATION_SECONDS(8, 18),
+    DURATION_MILLISECONDS(8, 19),
+    DURATION_MICROSECONDS(8, 20),
+    DURATION_NANOSECONDS(8, 21),
+    //DICTIONARY32(4, 22),
+
+    STRING(0, 23),
+    LIST(0, 24),
+    DECIMAL32(4, 25),
+    DECIMAL64(8, 26),
+    STRUCT(0, 27);
 
     final int sizeInBytes;
     final int nativeId;
-    final String simpleName;
 
-    DTypeEnum(int sizeInBytes, int nativeId, String simpleName) {
+    DTypeEnum(int sizeInBytes, int nativeId) {
       this.sizeInBytes = sizeInBytes;
       this.nativeId = nativeId;
-      this.simpleName = simpleName;
     }
 
     public int getNativeId() { return nativeId; }
@@ -191,12 +187,6 @@ private DType(DTypeEnum id, int decimalScale) {
    */
   public int getScale() { return scale; }
 
-  /**
-   * Returns string name mapped to type.
-   * @return name corresponding to type
-   */
-  public String getSimpleName() { return typeId.simpleName; }
-
   /**
    * Return enum for this DType
    * @return DTypeEnum
diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index f0bc3d930d9..c90d27efa97 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -27,11 +27,11 @@
 public class Schema {
   public static final Schema INFERRED = new Schema();
   private final List<String> names;
-  private final List<String> typeNames;
+  private final List<DType> types;
 
-  private Schema(List<String> names, List<String> typeNames) {
+  private Schema(List<String> names, List<DType> types) {
     this.names = new ArrayList<>(names);
-    this.typeNames = new ArrayList<>(typeNames);
+    this.types = new ArrayList<>(types);
   }
 
   /**
@@ -39,7 +39,7 @@ private Schema(List<String> names, List<String> typeNames) {
    */
   private Schema() {
     names = null;
-    typeNames = null;
+    types = null;
   }
 
   public static Builder builder() {
@@ -53,25 +53,40 @@ public String[] getColumnNames() {
     return names.toArray(new String[names.size()]);
   }
 
-  String[] getTypesAsStrings() {
-    if (typeNames == null) {
+  int[] getTypeIds() {
+    if (types == null) {
       return null;
     }
-    return typeNames.toArray(new String[typeNames.size()]);
+    int[] ret = new int[types.size()];
+    for (int i = 0; i < types.size(); i++) {
+      ret[i] = types.get(i).getTypeId().nativeId;
+    }
+    return ret;
+  }
+
+  int[] getTypeScales() {
+    if (types == null) {
+      return null;
+    }
+    int[] ret = new int[types.size()];
+    for (int i = 0; i < types.size(); i++) {
+      ret[i] = types.get(i).getScale();
+    }
+    return ret;
   }
 
   public static class Builder {
     private final List<String> names = new ArrayList<>();
-    private final List<String> typeNames = new ArrayList<>();
+    private final List<DType> types = new ArrayList<>();
 
     public Builder column(DType type, String name) {
-      typeNames.add(type.getSimpleName());
+      types.add(type);
       names.add(name);
       return this;
     }
 
     public Schema build() {
-      return new Schema(names, typeNames);
+      return new Schema(names, types);
     }
   }
 }
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 80cf55aa7f0..1fc9616d607 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -200,7 +200,8 @@ private static native long bound(long inputTable, long valueTable,
    * into a java
    * object to try and pull out all of the options.  If this becomes unwieldy we can change it.
    * @param columnNames       names of all of the columns, even the ones filtered out
-   * @param dTypes            types of all of the columns as strings.  Why strings? who knows.
+   * @param dTypeIds          native type IDs of all of the columns.
+   * @param dTypeScales       scale of the type for all of the columns.
    * @param filterColumnNames name of the columns to read, or an empty array if we want to read
    *                          all of them
    * @param filePath          the path of the file to read, or null if no path should be read.
@@ -214,7 +215,8 @@ private static native long bound(long inputTable, long valueTable,
    * @param trueValues        values that should be treated as boolean true
    * @param falseValues       values that should be treated as boolean false
    */
-  private static native long[] readCSV(String[] columnNames, String[] dTypes,
+  private static native long[] readCSV(String[] columnNames,
+                                       int[] dTypeIds, int[] dTypeScales,
                                        String[] filterColumnNames,
                                        String filePath, long address, long length,
                                        int headerRow, byte delim, byte quote,
@@ -652,7 +654,7 @@ public static Table readCSV(Schema schema, File path) {
    */
   public static Table readCSV(Schema schema, CSVOptions opts, File path) {
     return new Table(
-        readCSV(schema.getColumnNames(), schema.getTypesAsStrings(),
+        readCSV(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
             opts.getIncludeColumnNames(), path.getAbsolutePath(),
             0, 0,
             opts.getHeaderRow(),
@@ -725,7 +727,7 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf
     assert len > 0;
     assert len <= buffer.getLength() - offset;
     assert offset >= 0 && offset < buffer.length;
-    return new Table(readCSV(schema.getColumnNames(), schema.getTypesAsStrings(),
+    return new Table(readCSV(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
         opts.getIncludeColumnNames(), null,
         buffer.getAddress() + offset, len,
         opts.getHeaderRow(),
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 8db5ea98167..f9cd81e7e97 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1162,7 +1162,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
-    JNIEnv *env, jclass, jobjectArray col_names, jobjectArray data_types,
+    JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
     jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length,
     jint header_row, jbyte delim, jbyte quote, jbyte comment, jobjectArray null_values,
     jobjectArray true_values, jobjectArray false_values) {
@@ -1183,7 +1183,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jstringArray n_col_names(env, col_names);
-    cudf::jni::native_jstringArray n_data_types(env, data_types);
+    cudf::jni::native_jintArray n_types(env, j_types);
+    cudf::jni::native_jintArray n_scales(env, j_scales);
+    if (n_types.is_null() != n_scales.is_null()) {
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null",
+                    NULL);
+    }
+    std::vector<cudf::data_type> data_types;
+    if (!n_types.is_null()) {
+      if (n_types.size() != n_scales.size()) {
+        JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size",
+                      NULL);
+      }
+      data_types.reserve(n_types.size());
+      for (int index = 0; index < n_types.size(); index++) {
+        data_types.emplace_back(cudf::jni::make_data_type(n_types[index], n_scales[index]));
+      }
+    }
 
     cudf::jni::native_jstring filename(env, inputfilepath);
     if (!read_buffer && filename.is_empty()) {
@@ -1207,7 +1223,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
                                             .delimiter(delim)
                                             .header(header_row)
                                             .names(n_col_names.as_cpp_vector())
-                                            .dtypes(n_data_types.as_cpp_vector())
+                                            .dtypes(data_types)
                                             .use_cols_names(n_filter_col_names.as_cpp_vector())
                                             .true_values(n_true_values.as_cpp_vector())
                                             .false_values(n_false_values.as_cpp_vector())
@@ -1217,6 +1233,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
                                             .quotechar(quote)
                                             .comment(comment)
                                             .build();
+
     cudf::io::table_with_metadata result = cudf::io::read_csv(opts);
     return cudf::jni::convert_table_for_return(env, result.tbl);
   }

From 94c659c0adc1398fb7ef42c5dc06a1b37cdb21b7 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Thu, 19 Aug 2021 09:12:08 -0500
Subject: [PATCH 13/46] Add groupby_aggregation and groupby_scan_aggregation
 classes and force their usage. (#8906)

Followup to https://github.com/rapidsai/cudf/pull/8052
Partially addresses #7106

Adds the `groupby_aggregation` class and forces usage of that type when calling `groupby::aggregate()`

Adds the `groupby_scan_aggregation` class and forces usage of that type when calling `groupby::scan()`

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Jake Hemstad (https://github.com/jrhemstad)
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/8906
---
 cpp/benchmarks/groupby/group_nth_benchmark.cu |   2 +-
 cpp/examples/basic/src/process_csv.cpp        |   4 +-
 cpp/include/cudf/aggregation.hpp              |  25 +-
 .../cudf/detail/aggregation/aggregation.hpp   |  63 +--
 cpp/include/cudf/groupby.hpp                  |  23 +-
 cpp/src/aggregation/aggregation.cpp           |  40 ++
 cpp/src/groupby/common/utils.hpp              |   6 +-
 cpp/src/groupby/groupby.cu                    |   8 +-
 cpp/src/groupby/sort/scan.cpp                 |   2 +-
 cpp/src/io/json/reader_impl.cu                |   8 +-
 cpp/tests/groupby/argmax_tests.cpp            |  29 +-
 cpp/tests/groupby/argmin_tests.cpp            |  29 +-
 cpp/tests/groupby/collect_list_tests.cpp      |  23 +-
 cpp/tests/groupby/collect_set_tests.cpp       |  14 +-
 cpp/tests/groupby/count_scan_tests.cpp        |  33 +-
 cpp/tests/groupby/count_tests.cpp             |  47 ++-
 cpp/tests/groupby/groupby_test_util.hpp       |  10 +-
 cpp/tests/groupby/keys_tests.cpp              |  32 +-
 cpp/tests/groupby/m2_tests.cpp                |   2 +-
 cpp/tests/groupby/max_scan_tests.cpp          |  12 +-
 cpp/tests/groupby/max_tests.cpp               |  42 +-
 cpp/tests/groupby/mean_tests.cpp              |  13 +-
 cpp/tests/groupby/median_tests.cpp            |  13 +-
 cpp/tests/groupby/merge_lists_tests.cpp       |   3 +-
 cpp/tests/groupby/merge_m2_tests.cpp          |   9 +-
 cpp/tests/groupby/merge_sets_tests.cpp        |   3 +-
 cpp/tests/groupby/min_scan_tests.cpp          |  14 +-
 cpp/tests/groupby/min_tests.cpp               |  42 +-
 cpp/tests/groupby/nth_element_tests.cpp       |  87 +++--
 cpp/tests/groupby/nunique_tests.cpp           |  23 +-
 cpp/tests/groupby/product_tests.cpp           |  42 +-
 cpp/tests/groupby/quantile_tests.cpp          |  34 +-
 cpp/tests/groupby/rank_scan_tests.cpp         |  82 ++--
 cpp/tests/groupby/std_tests.cpp               |  15 +-
 cpp/tests/groupby/sum_of_squares_tests.cpp    |  16 +-
 cpp/tests/groupby/sum_scan_tests.cpp          |  12 +-
 cpp/tests/groupby/sum_tests.cpp               |  37 +-
 cpp/tests/groupby/var_tests.cpp               |  18 +-
 java/src/main/native/src/TableJni.cpp         |  28 +-
 python/cudf/cudf/_lib/aggregation.pxd         |  15 +-
 python/cudf/cudf/_lib/aggregation.pyx         | 369 ++++++++++++++++++
 python/cudf/cudf/_lib/cpp/aggregation.pxd     |   6 +
 python/cudf/cudf/_lib/cpp/groupby.pxd         |  14 +-
 python/cudf/cudf/_lib/groupby.pyx             | 158 +++++---
 44 files changed, 1105 insertions(+), 402 deletions(-)

diff --git a/cpp/benchmarks/groupby/group_nth_benchmark.cu b/cpp/benchmarks/groupby/group_nth_benchmark.cu
index 9765a4a265c..c6dbffb162e 100644
--- a/cpp/benchmarks/groupby/group_nth_benchmark.cu
+++ b/cpp/benchmarks/groupby/group_nth_benchmark.cu
@@ -63,7 +63,7 @@ void BM_pre_sorted_nth(benchmark::State& state)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = vals;
-  requests[0].aggregations.push_back(cudf::make_nth_element_aggregation(-1));
+  requests[0].aggregations.push_back(cudf::make_nth_element_aggregation<groupby_aggregation>(-1));
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp
index 2467c97393b..cd469af0036 100644
--- a/cpp/examples/basic/src/process_csv.cpp
+++ b/cpp/examples/basic/src/process_csv.cpp
@@ -25,7 +25,7 @@ void write_csv(cudf::table_view const& tbl_view, std::string const& file_path)
 }
 
 std::vector<cudf::groupby::aggregation_request> make_single_aggregation_request(
-  std::unique_ptr<cudf::aggregation>&& agg, cudf::column_view value)
+  std::unique_ptr<cudf::groupby_aggregation>&& agg, cudf::column_view value)
 {
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
@@ -42,7 +42,7 @@ std::unique_ptr<cudf::table> average_closing_price(cudf::table_view stock_info_t
 
   // Compute the average of each company's closing price with entire column
   cudf::groupby::groupby grpby_obj(keys);
-  auto requests = make_single_aggregation_request(cudf::make_mean_aggregation(), val);
+  auto requests = make_single_aggregation_request(cudf::make_mean_aggregation<cudf::groupby_aggregation>(), val);
 
   auto agg_results = grpby_obj.aggregate(requests);
 
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 7ac3638b21c..ff665e2706a 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -106,8 +106,7 @@ class aggregation {
 };
 
 /**
- * @brief Derived class intended for enforcing operation-specific restrictions
- * when calling various cudf functions.
+ * @brief Derived class intended for rolling_window specific aggregation usage.
  *
  * As an example, rolling_window will only accept rolling_aggregation inputs,
  * and the appropriate derived classes (sum_aggregation, mean_aggregation, etc)
@@ -121,6 +120,28 @@ class rolling_aggregation : public virtual aggregation {
   rolling_aggregation() {}
 };
 
+/**
+ * @brief Derived class intended for groupby specific aggregation usage.
+ */
+class groupby_aggregation : public virtual aggregation {
+ public:
+  ~groupby_aggregation() = default;
+
+ protected:
+  groupby_aggregation() {}
+};
+
+/**
+ * @brief Derived class intended for groupby specific scan usage.
+ */
+class groupby_scan_aggregation : public virtual aggregation {
+ public:
+  ~groupby_scan_aggregation() = default;
+
+ protected:
+  groupby_scan_aggregation() {}
+};
+
 enum class udf_type : bool { CUDA, PTX };
 
 /// Factory to create a SUM aggregation
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 163ad3e480f..4e4c63ae517 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -130,7 +130,9 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
 /**
  * @brief Derived class for specifying a sum aggregation
  */
-class sum_aggregation final : public rolling_aggregation {
+class sum_aggregation final : public rolling_aggregation,
+                              public groupby_aggregation,
+                              public groupby_scan_aggregation {
  public:
   sum_aggregation() : aggregation(SUM) {}
 
@@ -149,7 +151,7 @@ class sum_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying a product aggregation
  */
-class product_aggregation final : public aggregation {
+class product_aggregation final : public groupby_aggregation {
  public:
   product_aggregation() : aggregation(PRODUCT) {}
 
@@ -168,7 +170,9 @@ class product_aggregation final : public aggregation {
 /**
  * @brief Derived class for specifying a min aggregation
  */
-class min_aggregation final : public rolling_aggregation {
+class min_aggregation final : public rolling_aggregation,
+                              public groupby_aggregation,
+                              public groupby_scan_aggregation {
  public:
   min_aggregation() : aggregation(MIN) {}
 
@@ -187,7 +191,9 @@ class min_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying a max aggregation
  */
-class max_aggregation final : public rolling_aggregation {
+class max_aggregation final : public rolling_aggregation,
+                              public groupby_aggregation,
+                              public groupby_scan_aggregation {
  public:
   max_aggregation() : aggregation(MAX) {}
 
@@ -206,7 +212,9 @@ class max_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying a count aggregation
  */
-class count_aggregation final : public rolling_aggregation {
+class count_aggregation final : public rolling_aggregation,
+                                public groupby_aggregation,
+                                public groupby_scan_aggregation {
  public:
   count_aggregation(aggregation::Kind kind) : aggregation(kind) {}
 
@@ -263,7 +271,7 @@ class all_aggregation final : public aggregation {
 /**
  * @brief Derived class for specifying a sum_of_squares aggregation
  */
-class sum_of_squares_aggregation final : public aggregation {
+class sum_of_squares_aggregation final : public groupby_aggregation {
  public:
   sum_of_squares_aggregation() : aggregation(SUM_OF_SQUARES) {}
 
@@ -282,7 +290,7 @@ class sum_of_squares_aggregation final : public aggregation {
 /**
  * @brief Derived class for specifying a mean aggregation
  */
-class mean_aggregation final : public rolling_aggregation {
+class mean_aggregation final : public rolling_aggregation, public groupby_aggregation {
  public:
   mean_aggregation() : aggregation(MEAN) {}
 
@@ -301,7 +309,7 @@ class mean_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying a m2 aggregation
  */
-class m2_aggregation : public aggregation {
+class m2_aggregation : public groupby_aggregation {
  public:
   m2_aggregation() : aggregation{M2} {}
 
@@ -320,7 +328,7 @@ class m2_aggregation : public aggregation {
 /**
  * @brief Derived class for specifying a standard deviation/variance aggregation
  */
-class std_var_aggregation : public aggregation {
+class std_var_aggregation : public groupby_aggregation {
  public:
   size_type _ddof;  ///< Delta degrees of freedom
 
@@ -339,7 +347,6 @@ class std_var_aggregation : public aggregation {
     CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE,
                  "std_var_aggregation can accept only STD, VARIANCE");
   }
-
   size_type hash_impl() const { return std::hash<size_type>{}(_ddof); }
 };
 
@@ -348,7 +355,10 @@ class std_var_aggregation : public aggregation {
  */
 class var_aggregation final : public std_var_aggregation {
  public:
-  var_aggregation(size_type ddof) : std_var_aggregation{aggregation::VARIANCE, ddof} {}
+  var_aggregation(size_type ddof)
+    : aggregation{aggregation::VARIANCE}, std_var_aggregation{aggregation::VARIANCE, ddof}
+  {
+  }
 
   std::unique_ptr<aggregation> clone() const override
   {
@@ -367,7 +377,10 @@ class var_aggregation final : public std_var_aggregation {
  */
 class std_aggregation final : public std_var_aggregation {
  public:
-  std_aggregation(size_type ddof) : std_var_aggregation{aggregation::STD, ddof} {}
+  std_aggregation(size_type ddof)
+    : aggregation{aggregation::STD}, std_var_aggregation{aggregation::STD, ddof}
+  {
+  }
 
   std::unique_ptr<aggregation> clone() const override
   {
@@ -384,7 +397,7 @@ class std_aggregation final : public std_var_aggregation {
 /**
  * @brief Derived class for specifying a median aggregation
  */
-class median_aggregation final : public aggregation {
+class median_aggregation final : public groupby_aggregation {
  public:
   median_aggregation() : aggregation(MEDIAN) {}
 
@@ -403,7 +416,7 @@ class median_aggregation final : public aggregation {
 /**
  * @brief Derived class for specifying a quantile aggregation
  */
-class quantile_aggregation final : public aggregation {
+class quantile_aggregation final : public groupby_aggregation {
  public:
   quantile_aggregation(std::vector<double> const& q, interpolation i)
     : aggregation{QUANTILE}, _quantiles{q}, _interpolation{i}
@@ -449,7 +462,7 @@ class quantile_aggregation final : public aggregation {
 /**
  * @brief Derived class for specifying an argmax aggregation
  */
-class argmax_aggregation final : public rolling_aggregation {
+class argmax_aggregation final : public rolling_aggregation, public groupby_aggregation {
  public:
   argmax_aggregation() : aggregation(ARGMAX) {}
 
@@ -468,7 +481,7 @@ class argmax_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying an argmin aggregation
  */
-class argmin_aggregation final : public rolling_aggregation {
+class argmin_aggregation final : public rolling_aggregation, public groupby_aggregation {
  public:
   argmin_aggregation() : aggregation(ARGMIN) {}
 
@@ -487,7 +500,7 @@ class argmin_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying a nunique aggregation
  */
-class nunique_aggregation final : public aggregation {
+class nunique_aggregation final : public groupby_aggregation {
  public:
   nunique_aggregation(null_policy null_handling)
     : aggregation{NUNIQUE}, _null_handling{null_handling}
@@ -523,7 +536,7 @@ class nunique_aggregation final : public aggregation {
 /**
  * @brief Derived class for specifying a nth element aggregation
  */
-class nth_element_aggregation final : public aggregation {
+class nth_element_aggregation final : public groupby_aggregation {
  public:
   nth_element_aggregation(size_type n, null_policy null_handling)
     : aggregation{NTH_ELEMENT}, _n{n}, _null_handling{null_handling}
@@ -582,7 +595,7 @@ class row_number_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying a rank aggregation
  */
-class rank_aggregation final : public rolling_aggregation {
+class rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation {
  public:
   rank_aggregation() : aggregation{RANK} {}
 
@@ -601,7 +614,7 @@ class rank_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying a dense rank aggregation
  */
-class dense_rank_aggregation final : public rolling_aggregation {
+class dense_rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation {
  public:
   dense_rank_aggregation() : aggregation{DENSE_RANK} {}
 
@@ -620,7 +633,7 @@ class dense_rank_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived aggregation class for specifying COLLECT_LIST aggregation
  */
-class collect_list_aggregation final : public rolling_aggregation {
+class collect_list_aggregation final : public rolling_aggregation, public groupby_aggregation {
  public:
   explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE)
     : aggregation{COLLECT_LIST}, _null_handling{null_handling}
@@ -656,7 +669,7 @@ class collect_list_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived aggregation class for specifying COLLECT_SET aggregation
  */
-class collect_set_aggregation final : public rolling_aggregation {
+class collect_set_aggregation final : public rolling_aggregation, public groupby_aggregation {
  public:
   explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
                                    null_equality nulls_equal = null_equality::EQUAL,
@@ -795,7 +808,7 @@ class udf_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived aggregation class for specifying MERGE_LISTS aggregation
  */
-class merge_lists_aggregation final : public aggregation {
+class merge_lists_aggregation final : public groupby_aggregation {
  public:
   explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {}
 
@@ -814,7 +827,7 @@ class merge_lists_aggregation final : public aggregation {
 /**
  * @brief Derived aggregation class for specifying MERGE_SETS aggregation
  */
-class merge_sets_aggregation final : public aggregation {
+class merge_sets_aggregation final : public groupby_aggregation {
  public:
   explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal)
     : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal)
@@ -855,7 +868,7 @@ class merge_sets_aggregation final : public aggregation {
 /**
  * @brief Derived aggregation class for specifying MERGE_M2 aggregation
  */
-class merge_m2_aggregation final : public aggregation {
+class merge_m2_aggregation final : public groupby_aggregation {
  public:
   explicit merge_m2_aggregation() : aggregation{MERGE_M2} {}
 
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index 5656b38a0ef..3b8354ebc9f 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -56,8 +56,23 @@ class sort_groupby_helper;
  * `values.size()` column must equal `keys.num_rows()`.
  */
 struct aggregation_request {
-  column_view values;                                      ///< The elements to aggregate
-  std::vector<std::unique_ptr<aggregation>> aggregations;  ///< Desired aggregations
+  column_view values;                                              ///< The elements to aggregate
+  std::vector<std::unique_ptr<groupby_aggregation>> aggregations;  ///< Desired aggregations
+};
+
+/**
+ * @brief Request for groupby aggregation(s) for scanning a column.
+ *
+ * The group membership of each `values[i]` is determined by the corresponding
+ * row `i` in the original order of `keys` used to construct the
+ * `groupby`. I.e., for each `aggregation`, `values[i]` is aggregated with all
+ * other `values[j]` where rows `i` and `j` in `keys` are equivalent.
+ *
+ * `values.size()` column must equal `keys.num_rows()`.
+ */
+struct scan_request {
+  column_view values;  ///< The elements to aggregate
+  std::vector<std::unique_ptr<groupby_scan_aggregation>> aggregations;  ///< Desired aggregations
 };
 
 /**
@@ -222,7 +237,7 @@ class groupby {
    * specified in `requests`.
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
-    host_span<aggregation_request const> requests,
+    host_span<scan_request const> requests,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
@@ -388,7 +403,7 @@ class groupby {
     rmm::mr::device_memory_resource* mr);
 
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_scan(
-    host_span<aggregation_request const> requests,
+    host_span<scan_request const> requests,
     rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr);
 };
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 016f2367139..f0c522257fb 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -362,6 +362,8 @@ std::unique_ptr<Base> make_sum_aggregation()
 }
 template std::unique_ptr<aggregation> make_sum_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_sum_aggregation<rolling_aggregation>();
+template std::unique_ptr<groupby_aggregation> make_sum_aggregation<groupby_aggregation>();
+template std::unique_ptr<groupby_scan_aggregation> make_sum_aggregation<groupby_scan_aggregation>();
 
 /// Factory to create a PRODUCT aggregation
 template <typename Base>
@@ -370,6 +372,7 @@ std::unique_ptr<Base> make_product_aggregation()
   return std::make_unique<detail::product_aggregation>();
 }
 template std::unique_ptr<aggregation> make_product_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation> make_product_aggregation<groupby_aggregation>();
 
 /// Factory to create a MIN aggregation
 template <typename Base>
@@ -379,6 +382,8 @@ std::unique_ptr<Base> make_min_aggregation()
 }
 template std::unique_ptr<aggregation> make_min_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_min_aggregation<rolling_aggregation>();
+template std::unique_ptr<groupby_aggregation> make_min_aggregation<groupby_aggregation>();
+template std::unique_ptr<groupby_scan_aggregation> make_min_aggregation<groupby_scan_aggregation>();
 
 /// Factory to create a MAX aggregation
 template <typename Base>
@@ -388,6 +393,8 @@ std::unique_ptr<Base> make_max_aggregation()
 }
 template std::unique_ptr<aggregation> make_max_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_max_aggregation<rolling_aggregation>();
+template std::unique_ptr<groupby_aggregation> make_max_aggregation<groupby_aggregation>();
+template std::unique_ptr<groupby_scan_aggregation> make_max_aggregation<groupby_scan_aggregation>();
 
 /// Factory to create a COUNT aggregation
 template <typename Base>
@@ -401,6 +408,10 @@ template std::unique_ptr<aggregation> make_count_aggregation<aggregation>(
   null_policy null_handling);
 template std::unique_ptr<rolling_aggregation> make_count_aggregation<rolling_aggregation>(
   null_policy null_handling);
+template std::unique_ptr<groupby_aggregation> make_count_aggregation<groupby_aggregation>(
+  null_policy null_handling);
+template std::unique_ptr<groupby_scan_aggregation> make_count_aggregation<groupby_scan_aggregation>(
+  null_policy null_handling);
 
 /// Factory to create a ANY aggregation
 template <typename Base>
@@ -425,6 +436,8 @@ std::unique_ptr<Base> make_sum_of_squares_aggregation()
   return std::make_unique<detail::sum_of_squares_aggregation>();
 }
 template std::unique_ptr<aggregation> make_sum_of_squares_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation>
+make_sum_of_squares_aggregation<groupby_aggregation>();
 
 /// Factory to create a MEAN aggregation
 template <typename Base>
@@ -434,6 +447,7 @@ std::unique_ptr<Base> make_mean_aggregation()
 }
 template std::unique_ptr<aggregation> make_mean_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_mean_aggregation<rolling_aggregation>();
+template std::unique_ptr<groupby_aggregation> make_mean_aggregation<groupby_aggregation>();
 
 /// Factory to create a M2 aggregation
 template <typename Base>
@@ -442,6 +456,7 @@ std::unique_ptr<Base> make_m2_aggregation()
   return std::make_unique<detail::m2_aggregation>();
 }
 template std::unique_ptr<aggregation> make_m2_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation> make_m2_aggregation<groupby_aggregation>();
 
 /// Factory to create a VARIANCE aggregation
 template <typename Base>
@@ -450,6 +465,8 @@ std::unique_ptr<Base> make_variance_aggregation(size_type ddof)
   return std::make_unique<detail::var_aggregation>(ddof);
 }
 template std::unique_ptr<aggregation> make_variance_aggregation<aggregation>(size_type ddof);
+template std::unique_ptr<groupby_aggregation> make_variance_aggregation<groupby_aggregation>(
+  size_type ddof);
 
 /// Factory to create a STD aggregation
 template <typename Base>
@@ -458,6 +475,8 @@ std::unique_ptr<Base> make_std_aggregation(size_type ddof)
   return std::make_unique<detail::std_aggregation>(ddof);
 }
 template std::unique_ptr<aggregation> make_std_aggregation<aggregation>(size_type ddof);
+template std::unique_ptr<groupby_aggregation> make_std_aggregation<groupby_aggregation>(
+  size_type ddof);
 
 /// Factory to create a MEDIAN aggregation
 template <typename Base>
@@ -466,6 +485,7 @@ std::unique_ptr<Base> make_median_aggregation()
   return std::make_unique<detail::median_aggregation>();
 }
 template std::unique_ptr<aggregation> make_median_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation> make_median_aggregation<groupby_aggregation>();
 
 /// Factory to create a QUANTILE aggregation
 template <typename Base>
@@ -475,6 +495,8 @@ std::unique_ptr<Base> make_quantile_aggregation(std::vector<double> const& q, in
 }
 template std::unique_ptr<aggregation> make_quantile_aggregation<aggregation>(
   std::vector<double> const& q, interpolation i);
+template std::unique_ptr<groupby_aggregation> make_quantile_aggregation<groupby_aggregation>(
+  std::vector<double> const& q, interpolation i);
 
 /// Factory to create an ARGMAX aggregation
 template <typename Base>
@@ -484,6 +506,7 @@ std::unique_ptr<Base> make_argmax_aggregation()
 }
 template std::unique_ptr<aggregation> make_argmax_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_argmax_aggregation<rolling_aggregation>();
+template std::unique_ptr<groupby_aggregation> make_argmax_aggregation<groupby_aggregation>();
 
 /// Factory to create an ARGMIN aggregation
 template <typename Base>
@@ -493,6 +516,7 @@ std::unique_ptr<Base> make_argmin_aggregation()
 }
 template std::unique_ptr<aggregation> make_argmin_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_argmin_aggregation<rolling_aggregation>();
+template std::unique_ptr<groupby_aggregation> make_argmin_aggregation<groupby_aggregation>();
 
 /// Factory to create an NUNIQUE aggregation
 template <typename Base>
@@ -502,6 +526,8 @@ std::unique_ptr<Base> make_nunique_aggregation(null_policy null_handling)
 }
 template std::unique_ptr<aggregation> make_nunique_aggregation<aggregation>(
   null_policy null_handling);
+template std::unique_ptr<groupby_aggregation> make_nunique_aggregation<groupby_aggregation>(
+  null_policy null_handling);
 
 /// Factory to create an NTH_ELEMENT aggregation
 template <typename Base>
@@ -511,6 +537,8 @@ std::unique_ptr<Base> make_nth_element_aggregation(size_type n, null_policy null
 }
 template std::unique_ptr<aggregation> make_nth_element_aggregation<aggregation>(
   size_type n, null_policy null_handling);
+template std::unique_ptr<groupby_aggregation> make_nth_element_aggregation<groupby_aggregation>(
+  size_type n, null_policy null_handling);
 
 /// Factory to create a ROW_NUMBER aggregation
 template <typename Base>
@@ -528,6 +556,8 @@ std::unique_ptr<Base> make_rank_aggregation()
   return std::make_unique<detail::rank_aggregation>();
 }
 template std::unique_ptr<aggregation> make_rank_aggregation<aggregation>();
+template std::unique_ptr<groupby_scan_aggregation>
+make_rank_aggregation<groupby_scan_aggregation>();
 
 /// Factory to create a DENSE_RANK aggregation
 template <typename Base>
@@ -536,6 +566,8 @@ std::unique_ptr<Base> make_dense_rank_aggregation()
   return std::make_unique<detail::dense_rank_aggregation>();
 }
 template std::unique_ptr<aggregation> make_dense_rank_aggregation<aggregation>();
+template std::unique_ptr<groupby_scan_aggregation>
+make_dense_rank_aggregation<groupby_scan_aggregation>();
 
 /// Factory to create a COLLECT_LIST aggregation
 template <typename Base>
@@ -547,6 +579,8 @@ template std::unique_ptr<aggregation> make_collect_list_aggregation<aggregation>
   null_policy null_handling);
 template std::unique_ptr<rolling_aggregation> make_collect_list_aggregation<rolling_aggregation>(
   null_policy null_handling);
+template std::unique_ptr<groupby_aggregation> make_collect_list_aggregation<groupby_aggregation>(
+  null_policy null_handling);
 
 /// Factory to create a COLLECT_SET aggregation
 template <typename Base>
@@ -560,6 +594,8 @@ template std::unique_ptr<aggregation> make_collect_set_aggregation<aggregation>(
   null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal);
 template std::unique_ptr<rolling_aggregation> make_collect_set_aggregation<rolling_aggregation>(
   null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal);
+template std::unique_ptr<groupby_aggregation> make_collect_set_aggregation<groupby_aggregation>(
+  null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal);
 
 /// Factory to create a LAG aggregation
 template <typename Base>
@@ -605,6 +641,7 @@ std::unique_ptr<Base> make_merge_lists_aggregation()
   return std::make_unique<detail::merge_lists_aggregation>();
 }
 template std::unique_ptr<aggregation> make_merge_lists_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation> make_merge_lists_aggregation<groupby_aggregation>();
 
 /// Factory to create a MERGE_SETS aggregation
 template <typename Base>
@@ -615,6 +652,8 @@ std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal,
 }
 template std::unique_ptr<aggregation> make_merge_sets_aggregation<aggregation>(null_equality,
                                                                                nan_equality);
+template std::unique_ptr<groupby_aggregation> make_merge_sets_aggregation<groupby_aggregation>(
+  null_equality, nan_equality);
 
 /// Factory to create a MERGE_M2 aggregation
 template <typename Base>
@@ -623,6 +662,7 @@ std::unique_ptr<Base> make_merge_m2_aggregation()
   return std::make_unique<detail::merge_m2_aggregation>();
 }
 template std::unique_ptr<aggregation> make_merge_m2_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation> make_merge_m2_aggregation<groupby_aggregation>();
 
 namespace detail {
 namespace {
diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp
index e8d5c60f81a..3da20fb9af3 100644
--- a/cpp/src/groupby/common/utils.hpp
+++ b/cpp/src/groupby/common/utils.hpp
@@ -24,8 +24,10 @@
 namespace cudf {
 namespace groupby {
 namespace detail {
-inline std::vector<aggregation_result> extract_results(
-  host_span<aggregation_request const> requests, cudf::detail::result_cache& cache)
+
+template <typename RequestType>
+inline std::vector<aggregation_result> extract_results(host_span<RequestType const> requests,
+                                                       cudf::detail::result_cache& cache)
 {
   std::vector<aggregation_result> results(requests.size());
 
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 8c43c071a85..a26d69e3d46 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -120,7 +120,8 @@ struct empty_column_constructor {
 };
 
 /// Make an empty table with appropriate types for requested aggs
-auto empty_results(host_span<aggregation_request const> requests)
+template <typename RequestType>
+auto empty_results(host_span<RequestType const> requests)
 {
   std::vector<aggregation_result> empty_results;
 
@@ -144,7 +145,8 @@ auto empty_results(host_span<aggregation_request const> requests)
 }
 
 /// Verifies the agg requested on the request's values is valid
-void verify_valid_requests(host_span<aggregation_request const> requests)
+template <typename RequestType>
+void verify_valid_requests(host_span<RequestType const> requests)
 {
   CUDF_EXPECTS(
     std::all_of(
@@ -184,7 +186,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr
 
 // Compute scan requests
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan(
-  host_span<aggregation_request const> requests, rmm::mr::device_memory_resource* mr)
+  host_span<scan_request const> requests, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
index 450a8313402..c43df77bb5e 100644
--- a/cpp/src/groupby/sort/scan.cpp
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -152,7 +152,7 @@ void scan_result_functor::operator()<aggregation::DENSE_RANK>(aggregation const&
 
 // Sort-based groupby
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort_scan(
-  host_span<aggregation_request const> requests,
+  host_span<scan_request const> requests,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index a8f117c22bf..85608a0984a 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -87,12 +87,12 @@ std::unique_ptr<table> aggregate_keys_info(std::unique_ptr<table> info)
   auto const info_view = info->view();
   std::vector<groupby::aggregation_request> requests;
   requests.emplace_back(groupby::aggregation_request{info_view.column(0)});
-  requests.back().aggregations.emplace_back(make_min_aggregation());
-  requests.back().aggregations.emplace_back(make_nth_element_aggregation(0));
+  requests.back().aggregations.emplace_back(make_min_aggregation<groupby_aggregation>());
+  requests.back().aggregations.emplace_back(make_nth_element_aggregation<groupby_aggregation>(0));
 
   requests.emplace_back(groupby::aggregation_request{info_view.column(1)});
-  requests.back().aggregations.emplace_back(make_min_aggregation());
-  requests.back().aggregations.emplace_back(make_nth_element_aggregation(0));
+  requests.back().aggregations.emplace_back(make_min_aggregation<groupby_aggregation>());
+  requests.back().aggregations.emplace_back(make_nth_element_aggregation<groupby_aggregation>(0));
 
   // Aggregate by hash values
   groupby::groupby gb_obj(
diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp
index 6bf627d7b78..7cf693f7b08 100644
--- a/cpp/tests/groupby/argmax_tests.cpp
+++ b/cpp/tests/groupby/argmax_tests.cpp
@@ -47,10 +47,10 @@ TYPED_TEST(groupby_argmax_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals{0, 1, 2};
 
-  auto agg = cudf::make_argmax_aggregation();
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmax_aggregation();
+  auto agg2 = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -67,10 +67,10 @@ TYPED_TEST(groupby_argmax_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_argmax_aggregation();
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmax_aggregation();
+  auto agg2 = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -87,10 +87,10 @@ TYPED_TEST(groupby_argmax_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_argmax_aggregation();
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmax_aggregation();
+  auto agg2 = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -111,10 +111,10 @@ TYPED_TEST(groupby_argmax_test, null_keys_and_values)
   //  {6, 3,     5, 4, 0,   2, 1,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 4, 7, 0}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_argmax_aggregation();
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmax_aggregation();
+  auto agg2 = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -132,10 +132,10 @@ TEST_F(groupby_argmax_string_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals({0, 4, 2});
 
-  auto agg = cudf::make_argmax_aggregation();
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmax_aggregation();
+  auto agg2 = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -150,10 +150,10 @@ TEST_F(groupby_argmax_string_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_argmax_aggregation();
+  auto agg = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmax_aggregation();
+  auto agg2 = cudf::make_argmax_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -172,12 +172,13 @@ TEST_F(groupby_dictionary_argmax_test, basic)
   fixed_width_column_wrapper<R> expect_vals({ 0, 4, 2 });
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_argmax_aggregation());
+  test_single_agg(
+    keys, vals, expect_keys, expect_vals, cudf::make_argmax_aggregation<groupby_aggregation>());
   test_single_agg(keys,
                   vals,
                   expect_keys,
                   expect_vals,
-                  cudf::make_argmax_aggregation(),
+                  cudf::make_argmax_aggregation<groupby_aggregation>(),
                   force_use_sort_impl::YES);
 }
 
diff --git a/cpp/tests/groupby/argmin_tests.cpp b/cpp/tests/groupby/argmin_tests.cpp
index d192c1b21b1..915575546c9 100644
--- a/cpp/tests/groupby/argmin_tests.cpp
+++ b/cpp/tests/groupby/argmin_tests.cpp
@@ -47,10 +47,10 @@ TYPED_TEST(groupby_argmin_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals{6, 9, 8};
 
-  auto agg = cudf::make_argmin_aggregation();
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmin_aggregation();
+  auto agg2 = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -67,10 +67,10 @@ TYPED_TEST(groupby_argmin_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_argmin_aggregation();
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmin_aggregation();
+  auto agg2 = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -87,10 +87,10 @@ TYPED_TEST(groupby_argmin_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_argmin_aggregation();
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmin_aggregation();
+  auto agg2 = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -111,11 +111,11 @@ TYPED_TEST(groupby_argmin_test, null_keys_and_values)
   //  { 9, 6,     8, 5, 0,   7, 1,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 9, 8, 0}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_argmin_aggregation();
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
   // TODO: explore making this a gtest parameter
-  auto agg2 = cudf::make_argmin_aggregation();
+  auto agg2 = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -133,10 +133,10 @@ TEST_F(groupby_argmin_string_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals({3, 5, 7});
 
-  auto agg = cudf::make_argmin_aggregation();
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmin_aggregation();
+  auto agg2 = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -151,10 +151,10 @@ TEST_F(groupby_argmin_string_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_argmin_aggregation();
+  auto agg = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_argmin_aggregation();
+  auto agg2 = cudf::make_argmin_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -173,12 +173,13 @@ TEST_F(groupby_dictionary_argmin_test, basic)
   fixed_width_column_wrapper<R> expect_vals({ 3, 5, 7 });
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_argmin_aggregation());
+  test_single_agg(
+    keys, vals, expect_keys, expect_vals, cudf::make_argmin_aggregation<groupby_aggregation>());
   test_single_agg(keys,
                   vals,
                   expect_keys,
                   expect_vals,
-                  cudf::make_argmin_aggregation(),
+                  cudf::make_argmin_aggregation<groupby_aggregation>(),
                   force_use_sort_impl::YES);
 }
 
diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp
index 43c62743b9f..009917dabae 100644
--- a/cpp/tests/groupby/collect_list_tests.cpp
+++ b/cpp/tests/groupby/collect_list_tests.cpp
@@ -45,7 +45,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithoutNulls)
   fixed_width_column_wrapper<K, int32_t> expect_keys{1, 2};
   lists_column_wrapper<V, int32_t> expect_vals{{1, 2, 3}, {4, 5, 6}};
 
-  auto agg = cudf::make_collect_list_aggregation();
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>();
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -64,7 +64,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNulls)
   lists_column_wrapper<V, int32_t> expect_vals{
     {{1, 2}, validity.begin()}, {{3, 4}, validity.begin()}, {{5, 6}, validity.begin()}};
 
-  auto agg = cudf::make_collect_list_aggregation();
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>();
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -82,7 +82,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion)
 
   lists_column_wrapper<V, int32_t> expect_vals{{2}, {4}, {}, {8, 9}};
 
-  auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE);
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>(null_policy::EXCLUDE);
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -97,7 +97,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput)
   fixed_width_column_wrapper<K, int32_t> expect_keys{};
   lists_column_wrapper<V, int32_t> expect_vals{};
 
-  auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE);
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>(null_policy::EXCLUDE);
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -116,7 +116,7 @@ TYPED_TEST(groupby_collect_list_test, CollectLists)
   lists_column_wrapper<V, int32_t> expect_vals{
     {{1, 2}, {3, 4}}, {{5, 6, 7}, LCW{}}, {{9, 10}, {11}}};
 
-  auto agg = cudf::make_collect_list_aggregation();
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>();
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -135,7 +135,7 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion)
 
   LCW expect_vals{{{1, 2}}, {LCW{}}, {{9, 10}, {11}}, {}};
 
-  auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE);
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>(null_policy::EXCLUDE);
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -158,7 +158,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists)
   auto expect_values =
     cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {});
 
-  auto agg = cudf::make_collect_list_aggregation();
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>();
   test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg));
 }
 
@@ -190,7 +190,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs)
   auto expect_values = cudf::make_lists_column(
     0, make_empty_column(data_type{type_to_id<offset_type>()}), std::move(expect_child), 0, {});
 
-  auto agg = cudf::make_collect_list_aggregation();
+  auto agg = cudf::make_collect_list_aggregation<groupby_aggregation>();
   test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg));
 }
 
@@ -212,8 +212,11 @@ TYPED_TEST(groupby_collect_list_test, dictionary)
                                              0,
                                              rmm::device_buffer{});
 
-  test_single_agg(
-    keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_list_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals->view(),
+                  cudf::make_collect_list_aggregation<groupby_aggregation>());
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp
index 2f89b04c745..198caabfca9 100644
--- a/cpp/tests/groupby/collect_set_tests.cpp
+++ b/cpp/tests/groupby/collect_set_tests.cpp
@@ -33,16 +33,20 @@ namespace test {
 #define VALIDITY std::initializer_list<bool>
 
 struct CollectSetTest : public cudf::test::BaseFixture {
-  static auto collect_set() { return cudf::make_collect_set_aggregation(); }
+  static auto collect_set()
+  {
+    return cudf::make_collect_set_aggregation<cudf::groupby_aggregation>();
+  }
 
   static auto collect_set_null_unequal()
   {
-    return cudf::make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL);
+    return cudf::make_collect_set_aggregation<cudf::groupby_aggregation>(null_policy::INCLUDE,
+                                                                         null_equality::UNEQUAL);
   }
 
   static auto collect_set_null_exclude()
   {
-    return cudf::make_collect_set_aggregation(null_policy::EXCLUDE);
+    return cudf::make_collect_set_aggregation<cudf::groupby_aggregation>(null_policy::EXCLUDE);
   }
 };
 
@@ -174,7 +178,7 @@ TEST_F(CollectSetTest, FloatsWithNaN)
                   vals,
                   keys_expected,
                   vals_expected,
-                  cudf::make_collect_set_aggregation(
+                  cudf::make_collect_set_aggregation<cudf::groupby_aggregation>(
                     null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL));
   // null unequal with nan equal
   vals_expected = {
@@ -183,7 +187,7 @@ TEST_F(CollectSetTest, FloatsWithNaN)
                   vals,
                   keys_expected,
                   vals_expected,
-                  cudf::make_collect_set_aggregation(
+                  cudf::make_collect_set_aggregation<cudf::groupby_aggregation>(
                     null_policy::INCLUDE, null_equality::UNEQUAL, nan_equality::ALL_EQUAL));
 }
 
diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp
index 9740bfa1954..62e8b11241d 100644
--- a/cpp/tests/groupby/count_scan_tests.cpp
+++ b/cpp/tests/groupby/count_scan_tests.cpp
@@ -53,11 +53,11 @@ TYPED_TEST(groupby_count_scan_test, basic)
   result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 2};
   // clang-format on
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_scan_aggregation>();
   CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)),
                             "Unsupported groupby scan aggregation");
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE);
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -74,10 +74,10 @@ TYPED_TEST(groupby_count_scan_test, empty_cols)
   result_wrapper expect_vals;
   // clang-format on
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_scan_aggregation>();
   EXPECT_NO_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)));
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE);
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -94,7 +94,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_keys)
   result_wrapper expect_vals{};
   // clang-format on
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE);
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -111,7 +111,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_values)
   result_wrapper expect_vals{0, 1, 2};
   // clang-format on
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE);
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -130,7 +130,7 @@ TYPED_TEST(groupby_count_scan_test, null_keys_and_values)
   result_wrapper expect_vals{0, 1, 2, 0, 1,    2, 3, 0, 1, 0};
   // clang-format on
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE);
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -151,7 +151,7 @@ TEST_F(groupby_count_scan_string_test, basic)
   result_wrapper expect_vals{0, 0, 0, 1, 0, 1};
   // clang-format on
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE);
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -182,10 +182,14 @@ TYPED_TEST(FixedPointTestBothReps, GroupByCountScan)
   // clang-format on
 
   CUDF_EXPECT_THROW_MESSAGE(
-    test_single_scan(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()),
+    test_single_scan(keys,
+                     vals,
+                     expect_keys,
+                     expect_vals,
+                     cudf::make_count_aggregation<groupby_scan_aggregation>()),
     "Unsupported groupby scan aggregation");
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE);
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -205,11 +209,14 @@ TEST_F(groupby_dictionary_count_scan_test, basic)
   result_wrapper expect_vals{0, 0, 0, 1, 0, 1};
   // clang-format on
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_scan_aggregation>();
   CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)),
                             "Unsupported groupby scan aggregation");
-  test_single_scan(
-    keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(null_policy::INCLUDE));
+  test_single_scan(keys,
+                   vals,
+                   expect_keys,
+                   expect_vals,
+                   cudf::make_count_aggregation<groupby_scan_aggregation>(null_policy::INCLUDE));
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/count_tests.cpp b/cpp/tests/groupby/count_tests.cpp
index 2d45de04607..cbb821767c9 100644
--- a/cpp/tests/groupby/count_tests.cpp
+++ b/cpp/tests/groupby/count_tests.cpp
@@ -45,13 +45,13 @@ TYPED_TEST(groupby_count_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals{3, 4, 3};
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES);
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_aggregation>(null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -66,10 +66,10 @@ TYPED_TEST(groupby_count_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals;
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES);
 }
 
@@ -84,13 +84,13 @@ TYPED_TEST(groupby_count_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES);
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_aggregation>(null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -105,14 +105,14 @@ TYPED_TEST(groupby_count_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals{0};
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES);
 
   fixed_width_column_wrapper<R> expect_vals2{3};
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_aggregation>(null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2));
 }
 
@@ -133,14 +133,14 @@ TYPED_TEST(groupby_count_test, null_keys_and_values)
   fixed_width_column_wrapper<R> expect_vals({2,        3,         2,       0});
   // clang-format on
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES);
 
   fixed_width_column_wrapper<R> expect_vals2{3, 4, 2, 1};
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_aggregation>(null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2));
 }
 
@@ -160,10 +160,10 @@ TEST_F(groupby_count_string_test, basic)
   fixed_width_column_wrapper<K> expect_keys{0, 1, 3, 5};
   fixed_width_column_wrapper<R> expect_vals{1, 1, 2, 2};
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES);
 }
 // clang-format on
@@ -191,13 +191,13 @@ TYPED_TEST(FixedPointTestBothReps, GroupByCount)
   auto const expect_keys = fixed_width_column_wrapper<K>{1, 2, 3};
   auto const expect_vals = fixed_width_column_wrapper<R>{3, 4, 3};
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg1 = cudf::make_count_aggregation();
+  auto agg1 = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES);
 
-  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  auto agg2 = cudf::make_count_aggregation<groupby_aggregation>(null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2));
 }
 
@@ -216,9 +216,14 @@ TEST_F(groupby_dictionary_count_test, basic)
   fixed_width_column_wrapper<R> expect_vals{1, 1, 2, 2};
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation());
   test_single_agg(
-    keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(), force_use_sort_impl::YES);
+    keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation<groupby_aggregation>());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_count_aggregation<groupby_aggregation>(),
+                  force_use_sort_impl::YES);
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp
index 9a083ac8e74..542205b5b51 100644
--- a/cpp/tests/groupby/groupby_test_util.hpp
+++ b/cpp/tests/groupby/groupby_test_util.hpp
@@ -63,7 +63,7 @@ inline void test_single_agg(column_view const& keys,
                             column_view const& values,
                             column_view const& expect_keys,
                             column_view const& expect_vals,
-                            std::unique_ptr<aggregation>&& agg,
+                            std::unique_ptr<groupby_aggregation>&& agg,
                             force_use_sort_impl use_sort           = force_use_sort_impl::NO,
                             null_policy include_null_keys          = null_policy::EXCLUDE,
                             sorted keys_are_sorted                 = sorted::NO,
@@ -78,7 +78,7 @@ inline void test_single_agg(column_view const& keys,
 
   if (use_sort == force_use_sort_impl::YES) {
     // WAR to force groupby to use sort implementation
-    requests[0].aggregations.push_back(make_nth_element_aggregation(0));
+    requests[0].aggregations.push_back(make_nth_element_aggregation<groupby_aggregation>(0));
   }
 
   groupby::groupby gb_obj(
@@ -105,14 +105,14 @@ inline void test_single_scan(column_view const& keys,
                              column_view const& values,
                              column_view const& expect_keys,
                              column_view const& expect_vals,
-                             std::unique_ptr<aggregation>&& agg,
+                             std::unique_ptr<groupby_scan_aggregation>&& agg,
                              null_policy include_null_keys                  = null_policy::EXCLUDE,
                              sorted keys_are_sorted                         = sorted::NO,
                              std::vector<order> const& column_order         = {},
                              std::vector<null_order> const& null_precedence = {})
 {
-  std::vector<groupby::aggregation_request> requests;
-  requests.emplace_back(groupby::aggregation_request());
+  std::vector<groupby::scan_request> requests;
+  requests.emplace_back(groupby::scan_request());
   requests[0].values = values;
 
   requests[0].aggregations.push_back(std::move(agg));
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 91db37a5ff6..683eeb7eb01 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -50,7 +50,7 @@ TYPED_TEST(groupby_keys_test, basic)
   fixed_width_column_wrapper<R> expect_vals { 3, 4, 3 };
   // clang-format on
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -68,7 +68,7 @@ TYPED_TEST(groupby_keys_test, zero_valid_keys)
   fixed_width_column_wrapper<R> expect_vals { };
   // clang-format on
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -89,7 +89,7 @@ TYPED_TEST(groupby_keys_test, some_null_keys)
   fixed_width_column_wrapper<R> expect_vals { 3,        4,           2,     1};
   // clang-format on
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -111,7 +111,7 @@ TYPED_TEST(groupby_keys_test, include_null_keys)
   fixed_width_column_wrapper<R> expect_vals { 9,        19,          10,    4,  7};
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -135,7 +135,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys)
   fixed_width_column_wrapper<R> expect_vals { 3,       18,         24,      4};
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -160,7 +160,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_descending)
   fixed_width_column_wrapper<R> expect_vals { 0, 6,       22,        21      };
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -187,7 +187,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_nullable)
   fixed_width_column_wrapper<R> expect_vals { 3,       15,         17,      4};
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -215,7 +215,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_nulls_before_include_nulls)
   fixed_width_column_wrapper<R> expect_vals { 3,        7,     11,    7,  17,    4};
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -234,10 +234,11 @@ TYPED_TEST(groupby_keys_test, mismatch_num_rows)
   fixed_width_column_wrapper<K> keys{1, 2, 3};
   fixed_width_column_wrapper<V> vals{0, 1, 2, 3, 4};
 
-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation<groupby_aggregation>();
   CUDF_EXPECT_THROW_MESSAGE(test_single_agg(keys, vals, keys, vals, std::move(agg)),
                             "Size mismatch between request values and groupby keys.");
-  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg)),
+  auto agg2 = cudf::make_count_aggregation<groupby_scan_aggregation>();
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg2)),
                             "Size mismatch between request values and groupby keys.");
 }
 
@@ -257,7 +258,7 @@ TEST_F(groupby_string_keys_test, basic)
   fixed_width_column_wrapper<R> expect_vals {     9,    19,   17 };
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 // clang-format on
@@ -278,9 +279,14 @@ TEST_F(groupby_dictionary_keys_test, basic)
   fixed_width_column_wrapper<R> expect_vals({     9,    19,   17 });
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation());
   test_single_agg(
-    keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation(), force_use_sort_impl::YES);
+    keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation<groupby_aggregation>());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_sum_aggregation<groupby_aggregation>(),
+                  force_use_sort_impl::YES);
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp
index 7b338a0d9b8..be7d6c1ce05 100644
--- a/cpp/tests/groupby/m2_tests.cpp
+++ b/cpp/tests/groupby/m2_tests.cpp
@@ -44,7 +44,7 @@ auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = values;
-  requests[0].aggregations.emplace_back(cudf::make_m2_aggregation());
+  requests[0].aggregations.emplace_back(cudf::make_m2_aggregation<cudf::groupby_aggregation>());
 
   auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys}));
   auto result = gb_obj.aggregate(requests);
diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp
index 70a48da69e8..4d83dc9f7ba 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -55,7 +55,7 @@ TYPED_TEST(groupby_max_scan_test, basic)
   result_wrapper expect_vals({5, 8, 8, 6, 9, 9, 9, 7, 7, 7});
   // clang-format on
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -70,7 +70,7 @@ TYPED_TEST(groupby_max_scan_test, empty_cols)
   key_wrapper expect_keys{};
   result_wrapper expect_vals{};
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -87,7 +87,7 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_keys)
   result_wrapper expect_vals{};
   // clang-format on
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -104,7 +104,7 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_values)
   result_wrapper expect_vals({-1, -1, -1}, all_nulls());
   // clang-format on
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -124,7 +124,7 @@ TYPED_TEST(groupby_max_scan_test, null_keys_and_values)
                              { 0, 1, 1, 1, 1,  0, 1, 1,    1, 0});
   // clang-format on
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -152,7 +152,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue)
     auto const expect_vals_max = fp_wrapper{{5, 8, 8, 6, 9, 9, 9, 7, 7, 7}, scale};
     // clang-format on
 
-    auto agg = cudf::make_max_aggregation();
+    auto agg = cudf::make_max_aggregation<groupby_scan_aggregation>();
     test_single_scan(keys, vals, expect_keys, expect_vals_max, std::move(agg));
   }
 }
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index b5710d3f4bc..a1e34b625e8 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -46,10 +46,10 @@ TYPED_TEST(groupby_max_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals({6, 9, 8});
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_max_aggregation();
+  auto agg2 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -64,10 +64,10 @@ TYPED_TEST(groupby_max_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_max_aggregation();
+  auto agg2 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -82,10 +82,10 @@ TYPED_TEST(groupby_max_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_max_aggregation();
+  auto agg2 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -100,10 +100,10 @@ TYPED_TEST(groupby_max_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_max_aggregation();
+  auto agg2 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -122,10 +122,10 @@ TYPED_TEST(groupby_max_test, null_keys_and_values)
   //  { 0, 3,     1, 4, 5,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 5, 8, 0}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_max_aggregation();
+  auto agg2 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -140,10 +140,10 @@ TEST_F(groupby_max_string_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   strings_column_wrapper expect_vals({"año", "zit", "₹1"});
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_max_aggregation();
+  auto agg2 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -155,10 +155,10 @@ TEST_F(groupby_max_string_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   strings_column_wrapper expect_vals({""}, all_nulls());
 
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_max_aggregation();
+  auto agg2 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -187,7 +187,7 @@ TEST_F(groupby_max_string_test, max_sorted_strings)
   // fixed_width_column_wrapper<size_type> expect_argmax(
   // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1},
   // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
-  auto agg = cudf::make_max_aggregation();
+  auto agg = cudf::make_max_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -214,12 +214,16 @@ TEST_F(groupby_dictionary_max_test, basic)
 
   auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys());
 
-  test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_max_aggregation());
   test_single_agg(keys,
                   vals,
                   expect_keys,
                   expect_vals->view(),
-                  cudf::make_max_aggregation(),
+                  cudf::make_max_aggregation<cudf::groupby_aggregation>());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals->view(),
+                  cudf::make_max_aggregation<cudf::groupby_aggregation>(),
                   force_use_sort_impl::YES);
 }
 
@@ -247,7 +251,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue)
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
     auto const expect_vals_max = fp_wrapper{{6, 9, 8}, scale};
 
-    auto agg3 = cudf::make_max_aggregation();
+    auto agg3 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
     test_single_agg(
       keys, vals, expect_keys, expect_vals_max, std::move(agg3), force_use_sort_impl::YES);
   }
@@ -271,7 +275,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMaxDecimalAsValue)
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
     auto const expect_vals_max = fp_wrapper{{6, 9, 8}, scale};
 
-    auto agg7 = cudf::make_max_aggregation();
+    auto agg7 = cudf::make_max_aggregation<cudf::groupby_aggregation>();
     test_single_agg(keys, vals, expect_keys, expect_vals_max, std::move(agg7));
   }
 }
diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp
index bac95b11e81..613e1555b79 100644
--- a/cpp/tests/groupby/mean_tests.cpp
+++ b/cpp/tests/groupby/mean_tests.cpp
@@ -67,7 +67,7 @@ TYPED_TEST(groupby_mean_test, basic)
   fixed_width_column_wrapper<R, RT> expect_vals(expect_v.cbegin(), expect_v.cend());
   // clang-format on
 
-  auto agg = cudf::make_mean_aggregation();
+  auto agg = cudf::make_mean_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -82,7 +82,7 @@ TYPED_TEST(groupby_mean_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_mean_aggregation();
+  auto agg = cudf::make_mean_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -97,7 +97,7 @@ TYPED_TEST(groupby_mean_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_mean_aggregation();
+  auto agg = cudf::make_mean_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -112,7 +112,7 @@ TYPED_TEST(groupby_mean_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_mean_aggregation();
+  auto agg = cudf::make_mean_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -135,7 +135,7 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values)
   fixed_width_column_wrapper<R, RT> expect_vals(expect_v.cbegin(), expect_v.cend(), {1, 1, 1, 0});
   // clang-format on
 
-  auto agg = cudf::make_mean_aggregation();
+  auto agg = cudf::make_mean_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 // clang-format on
@@ -156,7 +156,8 @@ TEST_F(groupby_dictionary_mean_test, basic)
   fixed_width_column_wrapper<R, double> expect_vals({9. / 3, 19. / 4, 17. / 3});
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation());
+  test_single_agg(
+    keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation<groupby_aggregation>());
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/median_tests.cpp b/cpp/tests/groupby/median_tests.cpp
index 18979820911..86d89325401 100644
--- a/cpp/tests/groupby/median_tests.cpp
+++ b/cpp/tests/groupby/median_tests.cpp
@@ -51,7 +51,7 @@ TYPED_TEST(groupby_median_test, basic)
   fixed_width_column_wrapper<R> expect_vals({3.,     4.5,        7.}, no_nulls());
   // clang-format on
 
-  auto agg = cudf::make_median_aggregation();
+  auto agg = cudf::make_median_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -66,7 +66,7 @@ TYPED_TEST(groupby_median_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_median_aggregation();
+  auto agg = cudf::make_median_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -81,7 +81,7 @@ TYPED_TEST(groupby_median_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_median_aggregation();
+  auto agg = cudf::make_median_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -96,7 +96,7 @@ TYPED_TEST(groupby_median_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_median_aggregation();
+  auto agg = cudf::make_median_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -115,7 +115,7 @@ TYPED_TEST(groupby_median_test, null_keys_and_values)
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_median_aggregation();
+  auto agg = cudf::make_median_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -134,7 +134,8 @@ TYPED_TEST(groupby_median_test, dictionary)
   fixed_width_column_wrapper<R> expect_vals({3.,       4.5,       7.     }, no_nulls());
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_median_aggregation());
+  test_single_agg(
+    keys, vals, expect_keys, expect_vals, cudf::make_median_aggregation<groupby_aggregation>());
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp
index 29c6185e3a5..b6b1d1a1720 100644
--- a/cpp/tests/groupby/merge_lists_tests.cpp
+++ b/cpp/tests/groupby/merge_lists_tests.cpp
@@ -42,7 +42,8 @@ auto merge_lists(vcol_views const& keys_cols, vcol_views const& values_cols)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = *values;
-  requests[0].aggregations.emplace_back(cudf::make_merge_lists_aggregation());
+  requests[0].aggregations.emplace_back(
+    cudf::make_merge_lists_aggregation<cudf::groupby_aggregation>());
 
   auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys}));
   auto result = gb_obj.aggregate(requests);
diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp
index 3ec8bfec774..60067e78022 100644
--- a/cpp/tests/groupby/merge_m2_tests.cpp
+++ b/cpp/tests/groupby/merge_m2_tests.cpp
@@ -59,9 +59,9 @@ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view co
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = values;
-  requests[0].aggregations.emplace_back(cudf::make_count_aggregation());
-  requests[0].aggregations.emplace_back(cudf::make_mean_aggregation());
-  requests[0].aggregations.emplace_back(cudf::make_m2_aggregation());
+  requests[0].aggregations.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
+  requests[0].aggregations.emplace_back(cudf::make_mean_aggregation<cudf::groupby_aggregation>());
+  requests[0].aggregations.emplace_back(cudf::make_m2_aggregation<cudf::groupby_aggregation>());
 
   auto gb_obj                  = cudf::groupby::groupby(cudf::table_view({keys}));
   auto [out_keys, out_results] = gb_obj.aggregate(requests);
@@ -88,7 +88,8 @@ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = *values;
-  requests[0].aggregations.emplace_back(cudf::make_merge_m2_aggregation());
+  requests[0].aggregations.emplace_back(
+    cudf::make_merge_m2_aggregation<cudf::groupby_aggregation>());
 
   auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys}));
   auto result = gb_obj.aggregate(requests);
diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp
index ee4f61bf44f..5a65774b430 100644
--- a/cpp/tests/groupby/merge_sets_tests.cpp
+++ b/cpp/tests/groupby/merge_sets_tests.cpp
@@ -42,7 +42,8 @@ auto merge_sets(vcol_views const& keys_cols, vcol_views const& values_cols)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = *values;
-  requests[0].aggregations.emplace_back(cudf::make_merge_sets_aggregation());
+  requests[0].aggregations.emplace_back(
+    cudf::make_merge_sets_aggregation<cudf::groupby_aggregation>());
 
   auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys}));
   auto result = gb_obj.aggregate(requests);
diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp
index ef548407761..452f70eaf16 100644
--- a/cpp/tests/groupby/min_scan_tests.cpp
+++ b/cpp/tests/groupby/min_scan_tests.cpp
@@ -53,7 +53,7 @@ TYPED_TEST(groupby_min_scan_test, basic)
   result_wrapper expect_vals({5, 5, 1, 6, 6, 0, 0, 7, 2, 2});
   // clang-format on
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -68,7 +68,7 @@ TYPED_TEST(groupby_min_scan_test, empty_cols)
   key_wrapper expect_keys{};
   result_wrapper expect_vals{};
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -85,7 +85,7 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_keys)
   result_wrapper expect_vals{};
   // clang-format on
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -102,7 +102,7 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_values)
   result_wrapper expect_vals({-1, -1, -1}, all_nulls());
   // clang-format on
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -122,7 +122,7 @@ TYPED_TEST(groupby_min_scan_test, null_keys_and_values)
                              { 0, 1, 1, 1, 1,  0, 1, 1,    1, 0});
   // clang-format on
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -137,7 +137,7 @@ TEST_F(groupby_min_scan_string_test, basic)
   key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
   strings_column_wrapper expect_vals;
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
   CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)),
                             "Unsupported groupby scan type-agg combination");
 }
@@ -167,7 +167,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue)
     auto const expect_vals_min = fp_wrapper{{5, 5, 1, 6, 6, 0, 0, 7, 2, 2}, scale};
     // clang-format on
 
-    auto agg = cudf::make_min_aggregation();
+    auto agg = cudf::make_min_aggregation<groupby_scan_aggregation>();
     test_single_scan(keys, vals, expect_keys, expect_vals_min, std::move(agg));
   }
 }
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index 1544e867595..59e9d540709 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -46,10 +46,10 @@ TYPED_TEST(groupby_min_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals({0, 1, 2});
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_min_aggregation();
+  auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -64,10 +64,10 @@ TYPED_TEST(groupby_min_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_min_aggregation();
+  auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -82,10 +82,10 @@ TYPED_TEST(groupby_min_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_min_aggregation();
+  auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -100,10 +100,10 @@ TYPED_TEST(groupby_min_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_min_aggregation();
+  auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -122,10 +122,10 @@ TYPED_TEST(groupby_min_test, null_keys_and_values)
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({3, 1, 2, 0}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_min_aggregation();
+  auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -140,10 +140,10 @@ TEST_F(groupby_min_string_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   strings_column_wrapper expect_vals({"aaa", "bat", "$1"});
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_min_aggregation();
+  auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -155,10 +155,10 @@ TEST_F(groupby_min_string_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   strings_column_wrapper expect_vals({""}, all_nulls());
 
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_min_aggregation();
+  auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -187,7 +187,7 @@ TEST_F(groupby_min_string_test, min_sorted_strings)
   // fixed_width_column_wrapper<size_type> expect_argmin(
   // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1},
   // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
-  auto agg = cudf::make_min_aggregation();
+  auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -214,12 +214,16 @@ TEST_F(groupby_dictionary_min_test, basic)
 
   auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys());
 
-  test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_min_aggregation());
   test_single_agg(keys,
                   vals,
                   expect_keys,
                   expect_vals->view(),
-                  cudf::make_min_aggregation(),
+                  cudf::make_min_aggregation<cudf::groupby_aggregation>());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals->view(),
+                  cudf::make_min_aggregation<cudf::groupby_aggregation>(),
                   force_use_sort_impl::YES);
 }
 
@@ -246,7 +250,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue)
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
     auto const expect_vals_min = fp_wrapper{{0, 1, 2}, scale};
 
-    auto agg2 = cudf::make_min_aggregation();
+    auto agg2 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
     test_single_agg(
       keys, vals, expect_keys, expect_vals_min, std::move(agg2), force_use_sort_impl::YES);
   }
@@ -270,7 +274,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMinDecimalAsValue)
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
     auto const expect_vals_min = fp_wrapper{{0, 1, 2}, scale};
 
-    auto agg6 = cudf::make_min_aggregation();
+    auto agg6 = cudf::make_min_aggregation<cudf::groupby_aggregation>();
     test_single_agg(keys, vals, expect_keys, expect_vals_min, std::move(agg6));
   }
 }
diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp
index d5029147906..22f1e14815f 100644
--- a/cpp/tests/groupby/nth_element_tests.cpp
+++ b/cpp/tests/groupby/nth_element_tests.cpp
@@ -50,15 +50,15 @@ TYPED_TEST(groupby_nth_element_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
 
   //groupby.first()
-  auto agg = cudf::make_nth_element_aggregation(0);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0);
   fixed_width_column_wrapper<R, int32_t> expect_vals0({0, 1, 2});
   test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(1);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(1);
   fixed_width_column_wrapper<R, int32_t> expect_vals1({3, 4, 7});
   test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(2);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(2);
   fixed_width_column_wrapper<R, int32_t> expect_vals2({6, 5, 8});
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg));
 }
@@ -75,7 +75,7 @@ TYPED_TEST(groupby_nth_element_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_nth_element_aggregation(0);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -90,7 +90,7 @@ TYPED_TEST(groupby_nth_element_test, basic_out_of_bounds)
 
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
 
-  auto agg = cudf::make_nth_element_aggregation(3);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(3);
   fixed_width_column_wrapper<R, int32_t> expect_vals({0, 9, 0}, {0, 1, 0});
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
@@ -109,15 +109,15 @@ TYPED_TEST(groupby_nth_element_test, negative)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
 
   //groupby.last()
-  auto agg = cudf::make_nth_element_aggregation(-1);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-1);
   fixed_width_column_wrapper<R, int32_t> expect_vals0({6, 9, 8});
   test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(-2);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-2);
   fixed_width_column_wrapper<R, int32_t> expect_vals1({3, 5, 7});
   test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(-3);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-3);
   fixed_width_column_wrapper<R, int32_t> expect_vals2({0, 4, 2});
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg));
 }
@@ -133,7 +133,7 @@ TYPED_TEST(groupby_nth_element_test, negative_out_of_bounds)
 
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
 
-  auto agg = cudf::make_nth_element_aggregation(-4);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-4);
   fixed_width_column_wrapper<R, int32_t> expect_vals({0, 1, 0}, {0, 1, 0});
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
@@ -150,7 +150,7 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_nth_element_aggregation(0);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -166,7 +166,7 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R, int32_t> expect_vals({3}, all_nulls());
 
-  auto agg = cudf::make_nth_element_aggregation(0);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -186,7 +186,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values)
   //vals                                    {-,3,6,    1,4,-,9,  2,8,      -}
   fixed_width_column_wrapper<R, int32_t> expect_vals({-1, 1, 2, -1}, {0, 1, 1, 0});
 
-  auto agg = cudf::make_nth_element_aggregation(0);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -206,7 +206,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values_out_of_bounds)
   //                                         value,     null,       out,    out
   fixed_width_column_wrapper<R, int32_t> expect_vals({6, -1, -1, -1}, {1, 0, 0, 0});
 
-  auto agg = cudf::make_nth_element_aggregation(2);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(2);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -237,18 +237,18 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls)
   fixed_width_column_wrapper<R, int32_t> expect_vals1({6, 4, 2, -1}, {1, 1, 1, 0});
   fixed_width_column_wrapper<R, int32_t> expect_vals2({-1, 9, 8, -1}, {0, 1, 1, 0});
 
-  auto agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0, cudf::null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_nuls0, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(1, cudf::null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_nuls1, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(2, cudf::null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_nuls2, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::EXCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0, cudf::null_policy::EXCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::EXCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(1, cudf::null_policy::EXCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::EXCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(2, cudf::null_policy::EXCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg));
 }
 
@@ -282,18 +282,18 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls_negative_index)
   fixed_width_column_wrapper<R, int32_t> expect_vals1({3, 4, 2, -1}, {1, 1, 1, 0});
   fixed_width_column_wrapper<R, int32_t> expect_vals2({-1, 1, 2, -1}, {0, 1, 1, 0});
 
-  auto agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::INCLUDE);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-1, cudf::null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_nuls0, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::INCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-2, cudf::null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_nuls1, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::INCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-3, cudf::null_policy::INCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_nuls2, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::EXCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-1, cudf::null_policy::EXCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::EXCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-2, cudf::null_policy::EXCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg));
-  agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::EXCLUDE);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-3, cudf::null_policy::EXCLUDE);
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg));
 }
 
@@ -312,38 +312,38 @@ TEST_F(groupby_nth_element_string_test, basic_string)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
 
   //groupby.first()
-  auto agg = cudf::make_nth_element_aggregation(0);
+  auto agg = cudf::make_nth_element_aggregation<groupby_aggregation>(0);
   strings_column_wrapper expect_vals0{"ABCD", "1", "2"};
   test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(1);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(1);
   strings_column_wrapper expect_vals1{"3", "4", "7"};
   test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(2);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(2);
   strings_column_wrapper expect_vals2{"6", "5", "8"};
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg));
 
   //+ve out of bounds
-  agg = cudf::make_nth_element_aggregation(3);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(3);
   strings_column_wrapper expect_vals3{{"", "9", ""}, {0, 1, 0}};
   test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg));
 
   //groupby.last()
-  agg = cudf::make_nth_element_aggregation(-1);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-1);
   strings_column_wrapper expect_vals4{"6", "9", "8"};
   test_single_agg(keys, vals, expect_keys, expect_vals4, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(-2);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-2);
   strings_column_wrapper expect_vals5{"3", "5", "7"};
   test_single_agg(keys, vals, expect_keys, expect_vals5, std::move(agg));
 
-  agg = cudf::make_nth_element_aggregation(-3);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-3);
   strings_column_wrapper expect_vals6{"ABCD", "4", "2"};
   test_single_agg(keys, vals, expect_keys, expect_vals6, std::move(agg));
 
   //-ve out of bounds
-  agg = cudf::make_nth_element_aggregation(-4);
+  agg = cudf::make_nth_element_aggregation<groupby_aggregation>(-4);
   strings_column_wrapper expect_vals7{{"", "1", ""}, {0, 1, 0}};
   test_single_agg(keys, vals, expect_keys, expect_vals7, std::move(agg));
 }
@@ -361,8 +361,11 @@ TEST_F(groupby_nth_element_string_test, dictionary)
 
   auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys());
 
-  test_single_agg(
-    keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2));
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals->view(),
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(2));
 }
 
 template <typename T>
@@ -384,8 +387,11 @@ TYPED_TEST(groupby_nth_element_lists_test, Basics)
   auto expected_keys   = fixed_width_column_wrapper<K, int32_t>{1, 2, 3};
   auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 10}};
 
-  test_single_agg(
-    keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0));
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(0));
 }
 
 TYPED_TEST(groupby_nth_element_lists_test, EmptyInput)
@@ -401,8 +407,11 @@ TYPED_TEST(groupby_nth_element_lists_test, EmptyInput)
   auto expected_keys   = fixed_width_column_wrapper<K, int32_t>{};
   auto expected_values = lists{};
 
-  test_single_agg(
-    keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2));
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(2));
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/nunique_tests.cpp b/cpp/tests/groupby/nunique_tests.cpp
index 089ca8805d4..88a6a1c903b 100644
--- a/cpp/tests/groupby/nunique_tests.cpp
+++ b/cpp/tests/groupby/nunique_tests.cpp
@@ -49,7 +49,7 @@ TYPED_TEST(groupby_nunique_test, basic)
   fixed_width_column_wrapper<R> expect_bool_vals{2,   1,          1};
   // clang-format on
 
-  auto agg = cudf::make_nunique_aggregation();
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>();
   if (std::is_same<V, bool>())
     test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg));
   else
@@ -67,7 +67,7 @@ TYPED_TEST(groupby_nunique_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_nunique_aggregation();
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -83,7 +83,7 @@ TYPED_TEST(groupby_nunique_test, basic_duplicates)
   fixed_width_column_wrapper<R> expect_vals{2, 4, 1};
   fixed_width_column_wrapper<R> expect_bool_vals{2, 1, 1};
 
-  auto agg = cudf::make_nunique_aggregation();
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>();
   if (std::is_same<V, bool>())
     test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg));
   else
@@ -101,7 +101,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_nunique_aggregation();
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -116,7 +116,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals{0};
 
-  auto agg = cudf::make_nunique_aggregation();
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -136,7 +136,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values)
   fixed_width_column_wrapper<R> expect_vals{2, 3, 2, 0};
   fixed_width_column_wrapper<R> expect_bool_vals{1, 1, 1, 0};
 
-  auto agg = cudf::make_nunique_aggregation();
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>();
   if (std::is_same<V, bool>())
     test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg));
   else
@@ -160,7 +160,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values_with_duplicates)
   fixed_width_column_wrapper<R> expect_vals{2, 3, 2, 0};
   fixed_width_column_wrapper<R> expect_bool_vals{1, 1, 1, 0};
 
-  auto agg = cudf::make_nunique_aggregation();
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>();
   if (std::is_same<V, bool>())
     test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg));
   else
@@ -184,7 +184,7 @@ TYPED_TEST(groupby_nunique_test, include_nulls)
   fixed_width_column_wrapper<R> expect_vals{3, 4, 2, 1};
   fixed_width_column_wrapper<R> expect_bool_vals{2, 2, 1, 1};
 
-  auto agg = cudf::make_nunique_aggregation(null_policy::INCLUDE);
+  auto agg = cudf::make_nunique_aggregation<groupby_aggregation>(null_policy::INCLUDE);
   if (std::is_same<V, bool>())
     test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg));
   else
@@ -213,8 +213,11 @@ TYPED_TEST(groupby_nunique_test, dictionary)
   cudf::column_view expect_vals = (std::is_same<V, bool>()) ? cudf::column_view{expect_bool_vals}
                                                             : cudf::column_view{expect_fixed_vals};
 
-  test_single_agg(
-    keys, vals, expect_keys, expect_vals, cudf::make_nunique_aggregation(null_policy::INCLUDE));
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_nunique_aggregation<groupby_aggregation>(null_policy::INCLUDE));
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/product_tests.cpp b/cpp/tests/groupby/product_tests.cpp
index eaa2cc07ff8..047bf856493 100644
--- a/cpp/tests/groupby/product_tests.cpp
+++ b/cpp/tests/groupby/product_tests.cpp
@@ -51,7 +51,11 @@ TYPED_TEST(groupby_product_test, basic)
   fixed_width_column_wrapper<R> expect_vals({   0.,       180.,      112. }, no_nulls());
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_product_aggregation<cudf::groupby_aggregation>());
 }
 
 TYPED_TEST(groupby_product_test, empty_cols)
@@ -65,7 +69,11 @@ TYPED_TEST(groupby_product_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_product_aggregation<cudf::groupby_aggregation>());
 }
 
 TYPED_TEST(groupby_product_test, zero_valid_keys)
@@ -79,7 +87,11 @@ TYPED_TEST(groupby_product_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_product_aggregation<cudf::groupby_aggregation>());
 }
 
 TYPED_TEST(groupby_product_test, zero_valid_values)
@@ -93,7 +105,11 @@ TYPED_TEST(groupby_product_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_product_aggregation<cudf::groupby_aggregation>());
 }
 
 TYPED_TEST(groupby_product_test, null_keys_and_values)
@@ -114,7 +130,11 @@ TYPED_TEST(groupby_product_test, null_keys_and_values)
                                             { 1,        1,         1,       0});
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_product_aggregation<cudf::groupby_aggregation>());
 }
 
 TYPED_TEST(groupby_product_test, dictionary)
@@ -132,7 +152,11 @@ TYPED_TEST(groupby_product_test, dictionary)
   fixed_width_column_wrapper<R> expect_vals({  0.,     180.,        112. }, no_nulls());
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_product_aggregation<cudf::groupby_aggregation>());
 }
 
 TYPED_TEST(groupby_product_test, dictionary_with_nulls)
@@ -151,7 +175,11 @@ TYPED_TEST(groupby_product_test, dictionary_with_nulls)
   fixed_width_column_wrapper<R> expect_vals({  0.,     180.,        56. }, no_nulls());
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_product_aggregation<cudf::groupby_aggregation>());
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/quantile_tests.cpp b/cpp/tests/groupby/quantile_tests.cpp
index a82dae9edcb..43b065ee4d3 100644
--- a/cpp/tests/groupby/quantile_tests.cpp
+++ b/cpp/tests/groupby/quantile_tests.cpp
@@ -51,7 +51,7 @@ TYPED_TEST(groupby_quantile_test, basic)
   fixed_width_column_wrapper<R> expect_vals({3., 4.5, 7.}, no_nulls());
   // clang-format on
 
-  auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR);
+  auto agg = cudf::make_quantile_aggregation<groupby_aggregation>({0.5}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -66,7 +66,7 @@ TYPED_TEST(groupby_quantile_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR);
+  auto agg = cudf::make_quantile_aggregation<groupby_aggregation>({0.5}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -81,7 +81,7 @@ TYPED_TEST(groupby_quantile_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR);
+  auto agg = cudf::make_quantile_aggregation<groupby_aggregation>({0.5}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -96,7 +96,7 @@ TYPED_TEST(groupby_quantile_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR);
+  auto agg = cudf::make_quantile_aggregation<groupby_aggregation>({0.5}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -115,7 +115,7 @@ TYPED_TEST(groupby_quantile_test, null_keys_and_values)
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR);
+  auto agg = cudf::make_quantile_aggregation<groupby_aggregation>({0.5}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -134,7 +134,8 @@ TYPED_TEST(groupby_quantile_test, multiple_quantile)
   fixed_width_column_wrapper<R> expect_vals({1.5, 4.5, 3.25, 6.,   4.5, 7.5}, no_nulls());
   // clang-format on
 
-  auto agg = cudf::make_quantile_aggregation({0.25, 0.75}, interpolation::LINEAR);
+  auto agg =
+    cudf::make_quantile_aggregation<groupby_aggregation>({0.25, 0.75}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES);
 }
 
@@ -152,27 +153,27 @@ TYPED_TEST(groupby_quantile_test, interpolation_types)
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
   fixed_width_column_wrapper<R> expect_vals1({2.4,      4.2,         4.}, no_nulls());
-  auto agg1 = cudf::make_quantile_aggregation({0.4}, interpolation::LINEAR);
+  auto agg1 = cudf::make_quantile_aggregation<groupby_aggregation>({0.4}, interpolation::LINEAR);
   test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
   fixed_width_column_wrapper<R> expect_vals2({3,        4,           2}, no_nulls());
-  auto agg2 = cudf::make_quantile_aggregation({0.4}, interpolation::NEAREST);
+  auto agg2 = cudf::make_quantile_aggregation<groupby_aggregation>({0.4}, interpolation::NEAREST);
   test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
   fixed_width_column_wrapper<R> expect_vals3({0,        4,          2}, no_nulls());
-  auto agg3 = cudf::make_quantile_aggregation({0.4}, interpolation::LOWER);
+  auto agg3 = cudf::make_quantile_aggregation<groupby_aggregation>({0.4}, interpolation::LOWER);
   test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
   fixed_width_column_wrapper<R> expect_vals4({3,        5,           7}, no_nulls());
-  auto agg4 = cudf::make_quantile_aggregation({0.4}, interpolation::HIGHER);
+  auto agg4 = cudf::make_quantile_aggregation<groupby_aggregation>({0.4}, interpolation::HIGHER);
   test_single_agg(keys, vals, expect_keys, expect_vals4, std::move(agg4));
 
   //                                         {0, 3, 6,  1, 4, 5, 9,  2, 7}
   fixed_width_column_wrapper<R> expect_vals5({1.5,      4.5,         4.5}, no_nulls());
-  auto agg5 = cudf::make_quantile_aggregation({0.4}, interpolation::MIDPOINT);
+  auto agg5 = cudf::make_quantile_aggregation<groupby_aggregation>({0.4}, interpolation::MIDPOINT);
   test_single_agg(keys, vals, expect_keys, expect_vals5, std::move(agg5));
   // clang-format on
 }
@@ -192,11 +193,12 @@ TYPED_TEST(groupby_quantile_test, dictionary)
   fixed_width_column_wrapper<R> expect_vals({3.,      4.5,        7.}, no_nulls());
   // clang-format on
 
-  test_single_agg(keys,
-                  vals,
-                  expect_keys,
-                  expect_vals,
-                  cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR));
+  test_single_agg(
+    keys,
+    vals,
+    expect_keys,
+    expect_vals,
+    cudf::make_quantile_aggregation<groupby_aggregation>({0.5}, interpolation::LINEAR));
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp
index 51c4c1e63c2..37e75e2e906 100644
--- a/cpp/tests/groupby/rank_scan_tests.cpp
+++ b/cpp/tests/groupby/rank_scan_tests.cpp
@@ -39,11 +39,16 @@ inline void test_pair_rank_scans(column_view const& keys,
                    order,
                    keys,
                    expected_dense,
-                   make_dense_rank_aggregation(),
+                   make_dense_rank_aggregation<groupby_scan_aggregation>(),
+                   null_policy::INCLUDE,
+                   sorted::YES);
+  test_single_scan(keys,
+                   order,
+                   keys,
+                   expected_rank,
+                   make_rank_aggregation<groupby_scan_aggregation>(),
                    null_policy::INCLUDE,
                    sorted::YES);
-  test_single_scan(
-    keys, order, keys, expected_rank, make_rank_aggregation(), null_policy::INCLUDE, sorted::YES);
 }
 
 struct groupby_rank_scan_test : public BaseFixture {
@@ -201,11 +206,11 @@ TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs)
   auto expected_rank_vals =
     fixed_width_column_wrapper<size_type>{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3};
 
-  std::vector<groupby::aggregation_request> requests;
-  requests.emplace_back(groupby::aggregation_request());
+  std::vector<groupby::scan_request> requests;
+  requests.emplace_back(groupby::scan_request());
   requests[0].values = *struct_col;
-  requests[0].aggregations.push_back(make_dense_rank_aggregation());
-  requests[0].aggregations.push_back(make_rank_aggregation());
+  requests[0].aggregations.push_back(make_dense_rank_aggregation<groupby_scan_aggregation>());
+  requests[0].aggregations.push_back(make_rank_aggregation<groupby_scan_aggregation>());
 
   groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES);
   auto result = gb_obj.scan(requests);
@@ -377,34 +382,61 @@ TEST_F(groupby_rank_scan_test_failures, test_exception_triggers)
   fixed_width_column_wrapper<T> col{3, 3, 1};
 
   CUDF_EXPECT_THROW_MESSAGE(
-    test_single_scan(
-      keys, col, keys, col, make_dense_rank_aggregation(), null_policy::INCLUDE, sorted::NO),
+    test_single_scan(keys,
+                     col,
+                     keys,
+                     col,
+                     make_dense_rank_aggregation<groupby_scan_aggregation>(),
+                     null_policy::INCLUDE,
+                     sorted::NO),
     "Dense rank aggregate in groupby scan requires the keys to be presorted");
 
-  CUDF_EXPECT_THROW_MESSAGE(
-    test_single_scan(
-      keys, col, keys, col, make_rank_aggregation(), null_policy::INCLUDE, sorted::NO),
-    "Rank aggregate in groupby scan requires the keys to be presorted");
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys,
+                                             col,
+                                             keys,
+                                             col,
+                                             make_rank_aggregation<groupby_scan_aggregation>(),
+                                             null_policy::INCLUDE,
+                                             sorted::NO),
+                            "Rank aggregate in groupby scan requires the keys to be presorted");
 
   CUDF_EXPECT_THROW_MESSAGE(
-    test_single_scan(
-      keys, col, keys, col, make_dense_rank_aggregation(), null_policy::EXCLUDE, sorted::YES),
+    test_single_scan(keys,
+                     col,
+                     keys,
+                     col,
+                     make_dense_rank_aggregation<groupby_scan_aggregation>(),
+                     null_policy::EXCLUDE,
+                     sorted::YES),
     "Dense rank aggregate in groupby scan requires the keys to be presorted");
 
-  CUDF_EXPECT_THROW_MESSAGE(
-    test_single_scan(
-      keys, col, keys, col, make_rank_aggregation(), null_policy::EXCLUDE, sorted::YES),
-    "Rank aggregate in groupby scan requires the keys to be presorted");
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys,
+                                             col,
+                                             keys,
+                                             col,
+                                             make_rank_aggregation<groupby_scan_aggregation>(),
+                                             null_policy::EXCLUDE,
+                                             sorted::YES),
+                            "Rank aggregate in groupby scan requires the keys to be presorted");
 
   CUDF_EXPECT_THROW_MESSAGE(
-    test_single_scan(
-      keys, col, keys, col, make_dense_rank_aggregation(), null_policy::EXCLUDE, sorted::NO),
+    test_single_scan(keys,
+                     col,
+                     keys,
+                     col,
+                     make_dense_rank_aggregation<groupby_scan_aggregation>(),
+                     null_policy::EXCLUDE,
+                     sorted::NO),
     "Dense rank aggregate in groupby scan requires the keys to be presorted");
 
-  CUDF_EXPECT_THROW_MESSAGE(
-    test_single_scan(
-      keys, col, keys, col, make_rank_aggregation(), null_policy::EXCLUDE, sorted::NO),
-    "Rank aggregate in groupby scan requires the keys to be presorted");
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys,
+                                             col,
+                                             keys,
+                                             col,
+                                             make_rank_aggregation<groupby_scan_aggregation>(),
+                                             null_policy::EXCLUDE,
+                                             sorted::NO),
+                            "Rank aggregate in groupby scan requires the keys to be presorted");
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/std_tests.cpp b/cpp/tests/groupby/std_tests.cpp
index c771971ad9a..e2edabf3e8f 100644
--- a/cpp/tests/groupby/std_tests.cpp
+++ b/cpp/tests/groupby/std_tests.cpp
@@ -53,7 +53,7 @@ TYPED_TEST(groupby_std_test, basic)
   fixed_width_column_wrapper<R> expect_vals({3.,       sqrt(131./12), sqrt(31./3)}, no_nulls());
   // clang-format on
 
-  auto agg = cudf::make_std_aggregation();
+  auto agg = cudf::make_std_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -68,7 +68,7 @@ TYPED_TEST(groupby_std_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_std_aggregation();
+  auto agg = cudf::make_std_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -83,7 +83,7 @@ TYPED_TEST(groupby_std_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_std_aggregation();
+  auto agg = cudf::make_std_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -98,7 +98,7 @@ TYPED_TEST(groupby_std_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_std_aggregation();
+  auto agg = cudf::make_std_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -118,7 +118,7 @@ TYPED_TEST(groupby_std_test, null_keys_and_values)
   fixed_width_column_wrapper<R> expect_vals({3 / sqrt(2), 7 / sqrt(3), 3 * sqrt(2), 0.},
                                             {1, 1, 1, 0});
 
-  auto agg = cudf::make_std_aggregation();
+  auto agg = cudf::make_std_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -137,7 +137,7 @@ TYPED_TEST(groupby_std_test, ddof_non_default)
   //                                        { 3, 6,     1, 4, 9,   2, 8,    3}
   fixed_width_column_wrapper<R> expect_vals({0., 7 * sqrt(2. / 3), 0., 0.}, {0, 1, 0, 0});
 
-  auto agg = cudf::make_std_aggregation(2);
+  auto agg = cudf::make_std_aggregation<cudf::groupby_aggregation>(2);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -156,7 +156,8 @@ TYPED_TEST(groupby_std_test, dictionary)
   fixed_width_column_wrapper<R> expect_vals({3.,       sqrt(131./12), sqrt(31./3)}, no_nulls());
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_std_aggregation());
+  test_single_agg(
+    keys, vals, expect_keys, expect_vals, cudf::make_std_aggregation<cudf::groupby_aggregation>());
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/sum_of_squares_tests.cpp b/cpp/tests/groupby/sum_of_squares_tests.cpp
index 12b044c7382..0dab2c6483e 100644
--- a/cpp/tests/groupby/sum_of_squares_tests.cpp
+++ b/cpp/tests/groupby/sum_of_squares_tests.cpp
@@ -49,7 +49,7 @@ TYPED_TEST(groupby_sum_of_squares_test, basic)
   //  { 0, 3, 6,  1, 4, 5, 9,  2, 7, 8}
   fixed_width_column_wrapper<R> expect_vals({45., 123., 117.}, no_nulls());
 
-  auto agg = cudf::make_sum_of_squares_aggregation();
+  auto agg = cudf::make_sum_of_squares_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -64,7 +64,7 @@ TYPED_TEST(groupby_sum_of_squares_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_sum_of_squares_aggregation();
+  auto agg = cudf::make_sum_of_squares_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -79,7 +79,7 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_sum_of_squares_aggregation();
+  auto agg = cudf::make_sum_of_squares_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -94,7 +94,7 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_sum_of_squares_aggregation();
+  auto agg = cudf::make_sum_of_squares_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -113,7 +113,7 @@ TYPED_TEST(groupby_sum_of_squares_test, null_keys_and_values)
   //  { 3, 6,     1, 4, 9,   2, 8,    3}
   fixed_width_column_wrapper<R> expect_vals({45., 98., 68., 9.}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_sum_of_squares_aggregation();
+  auto agg = cudf::make_sum_of_squares_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -132,7 +132,11 @@ TYPED_TEST(groupby_sum_of_squares_test, dictionary)
   fixed_width_column_wrapper<R> expect_vals({45.,       123.,       117.   }, no_nulls());
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_of_squares_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_sum_of_squares_aggregation<groupby_aggregation>());
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index 2f1928747ae..86fc0238597 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -57,7 +57,7 @@ TYPED_TEST(groupby_sum_scan_test, basic)
   //                        {0, 3, 6, 1, 4, 5, 9, 2, 7, 8}
   result_wrapper expect_vals{0, 3, 9, 1, 5, 10, 19, 2, 9, 17};
   // clang-format on
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -74,7 +74,7 @@ TYPED_TEST(groupby_sum_scan_test, empty_cols)
   result_wrapper expect_vals{};
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -91,7 +91,7 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_keys)
   result_wrapper expect_vals{};
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -108,7 +108,7 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_values)
   result_wrapper expect_vals({3, 4, 5}, all_nulls());
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -128,7 +128,7 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values)
                              { 0, 1, 1, 1, 1,  0,  1, 1,    1, 0});
   // clang-format on
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_scan_aggregation>();
   test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -156,7 +156,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue)
     auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale};
     // clang-format on
 
-    auto agg2 = cudf::make_sum_aggregation();
+    auto agg2 = cudf::make_sum_aggregation<groupby_scan_aggregation>();
     test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2));
   }
 }
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 458937ff2e4..5c935ee5a9d 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -49,10 +49,10 @@ TYPED_TEST(groupby_sum_test, basic)
   fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
   fixed_width_column_wrapper<R> expect_vals{9, 19, 17};
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_sum_aggregation();
+  auto agg2 = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -67,10 +67,10 @@ TYPED_TEST(groupby_sum_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_sum_aggregation();
+  auto agg2 = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -85,10 +85,10 @@ TYPED_TEST(groupby_sum_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_sum_aggregation();
+  auto agg2 = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -103,10 +103,10 @@ TYPED_TEST(groupby_sum_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_sum_aggregation();
+  auto agg2 = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 
@@ -125,10 +125,10 @@ TYPED_TEST(groupby_sum_test, null_keys_and_values)
   //  { 3, 6,     1, 4, 9,   2, 8,    -}
   fixed_width_column_wrapper<R> expect_vals({9, 14, 10, 0}, {1, 1, 1, 0});
 
-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 
-  auto agg2 = cudf::make_sum_aggregation();
+  auto agg2 = cudf::make_sum_aggregation<groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES);
 }
 // clang-format on
@@ -146,9 +146,14 @@ TYPED_TEST(groupby_sum_test, dictionary)
   fixed_width_column_wrapper<R> expect_vals{ 9, 19, 17};
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation());
   test_single_agg(
-    keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation(), force_use_sort_impl::YES);
+    keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation<groupby_aggregation>());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_sum_aggregation<groupby_aggregation>(),
+                  force_use_sort_impl::YES);
 }
 
 template <typename T>
@@ -176,11 +181,11 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue)
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
     auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale};
 
-    auto agg1 = cudf::make_sum_aggregation();
+    auto agg1 = cudf::make_sum_aggregation<groupby_aggregation>();
     test_single_agg(
       keys, vals, expect_keys, expect_vals_sum, std::move(agg1), force_use_sort_impl::YES);
 
-    auto agg4 = cudf::make_product_aggregation();
+    auto agg4 = cudf::make_product_aggregation<groupby_aggregation>();
     EXPECT_THROW(
       test_single_agg(keys, vals, expect_keys, {}, std::move(agg4), force_use_sort_impl::YES),
       cudf::logic_error);
@@ -206,10 +211,10 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashSumDecimalAsValue)
     auto const expect_keys     = fixed_width_column_wrapper<K>{1, 2, 3};
     auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale};
 
-    auto agg5 = cudf::make_sum_aggregation();
+    auto agg5 = cudf::make_sum_aggregation<groupby_aggregation>();
     test_single_agg(keys, vals, expect_keys, expect_vals_sum, std::move(agg5));
 
-    auto agg8 = cudf::make_product_aggregation();
+    auto agg8 = cudf::make_product_aggregation<groupby_aggregation>();
     EXPECT_THROW(test_single_agg(keys, vals, expect_keys, {}, std::move(agg8)), cudf::logic_error);
   }
 }
diff --git a/cpp/tests/groupby/var_tests.cpp b/cpp/tests/groupby/var_tests.cpp
index c3fc781801d..68ccf791960 100644
--- a/cpp/tests/groupby/var_tests.cpp
+++ b/cpp/tests/groupby/var_tests.cpp
@@ -53,7 +53,7 @@ TYPED_TEST(groupby_var_test, basic)
   fixed_width_column_wrapper<R> expect_vals({9.,      131. / 12,   31. / 3}, no_nulls());
   // clang-format on
 
-  auto agg = cudf::make_variance_aggregation();
+  auto agg = cudf::make_variance_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -68,7 +68,7 @@ TYPED_TEST(groupby_var_test, empty_cols)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_variance_aggregation();
+  auto agg = cudf::make_variance_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -83,7 +83,7 @@ TYPED_TEST(groupby_var_test, zero_valid_keys)
   fixed_width_column_wrapper<K> expect_keys{};
   fixed_width_column_wrapper<R> expect_vals{};
 
-  auto agg = cudf::make_variance_aggregation();
+  auto agg = cudf::make_variance_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -98,7 +98,7 @@ TYPED_TEST(groupby_var_test, zero_valid_values)
   fixed_width_column_wrapper<K> expect_keys{1};
   fixed_width_column_wrapper<R> expect_vals({0}, all_nulls());
 
-  auto agg = cudf::make_variance_aggregation();
+  auto agg = cudf::make_variance_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -119,7 +119,7 @@ TYPED_TEST(groupby_var_test, null_keys_and_values)
   fixed_width_column_wrapper<R> expect_vals({4.5,      49. / 3,   18.,     0.}, {1, 1, 1, 0});
   // clang-format on
 
-  auto agg = cudf::make_variance_aggregation();
+  auto agg = cudf::make_variance_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -141,7 +141,7 @@ TYPED_TEST(groupby_var_test, ddof_non_default)
                                             {0,         1,         0,       0});
   // clang-format on
 
-  auto agg = cudf::make_variance_aggregation(2);
+  auto agg = cudf::make_variance_aggregation<cudf::groupby_aggregation>(2);
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
@@ -160,7 +160,11 @@ TYPED_TEST(groupby_var_test, dictionary)
   fixed_width_column_wrapper<R> expect_vals({9.,      131./12,      31./3  }, no_nulls());
   // clang-format on
 
-  test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_variance_aggregation());
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  cudf::make_variance_aggregation<cudf::groupby_aggregation>());
 }
 
 }  // namespace test
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index f9cd81e7e97..595bc1df151 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -2345,11 +2345,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate(
     for (int i = 0; i < n_values.size(); i++) {
       cudf::groupby::aggregation_request req;
       int col_index = n_values[i];
+
+      cudf::groupby_aggregation *agg =
+          dynamic_cast<cudf::groupby_aggregation *>(n_agg_instances[i]);
+      JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of groupby_aggregation",
+                    nullptr);
+      std::unique_ptr<cudf::groupby_aggregation> cloned(
+          dynamic_cast<cudf::groupby_aggregation *>(agg->clone().release()));
+
       if (col_index == previous_index) {
-        requests.back().aggregations.push_back(n_agg_instances[i]->clone());
+        requests.back().aggregations.push_back(std::move(cloned));
       } else {
         req.values = n_input_table->column(col_index);
-        req.aggregations.push_back(n_agg_instances[i]->clone());
+        req.aggregations.push_back(std::move(cloned));
         requests.push_back(std::move(req));
       }
       previous_index = col_index;
@@ -2401,17 +2409,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan(
 
     // Aggregates are passed in already grouped by column, so we just need to fill it in
     // as we go.
-    std::vector<cudf::groupby::aggregation_request> requests;
+    std::vector<cudf::groupby::scan_request> requests;
 
     int previous_index = -1;
     for (int i = 0; i < n_values.size(); i++) {
-      cudf::groupby::aggregation_request req;
+      cudf::groupby::scan_request req;
       int col_index = n_values[i];
+
+      cudf::groupby_scan_aggregation *agg =
+          dynamic_cast<cudf::groupby_scan_aggregation *>(n_agg_instances[i]);
+      JNI_ARG_CHECK(env, agg != nullptr,
+                    "aggregation is not an instance of groupby_scan_aggregation", nullptr);
+      std::unique_ptr<cudf::groupby_scan_aggregation> cloned(
+          dynamic_cast<cudf::groupby_scan_aggregation *>(agg->clone().release()));
+
       if (col_index == previous_index) {
-        requests.back().aggregations.push_back(n_agg_instances[i]->clone());
+        requests.back().aggregations.push_back(std::move(cloned));
       } else {
         req.values = n_input_table->column(col_index);
-        req.aggregations.push_back(n_agg_instances[i]->clone());
+        req.aggregations.push_back(std::move(cloned));
         requests.push_back(std::move(req));
       }
       previous_index = col_index;
diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd
index f608dab3fe1..84bcaed1b36 100644
--- a/python/cudf/cudf/_lib/aggregation.pxd
+++ b/python/cudf/cudf/_lib/aggregation.pxd
@@ -2,7 +2,12 @@
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.aggregation cimport aggregation, rolling_aggregation
+from cudf._lib.cpp.aggregation cimport (
+    aggregation,
+    groupby_aggregation,
+    groupby_scan_aggregation,
+    rolling_aggregation,
+)
 
 
 cdef class Aggregation:
@@ -11,5 +16,13 @@ cdef class Aggregation:
 cdef class RollingAggregation:
     cdef unique_ptr[rolling_aggregation] c_obj
 
+cdef class GroupbyAggregation:
+    cdef unique_ptr[groupby_aggregation] c_obj
+
+cdef class GroupbyScanAggregation:
+    cdef unique_ptr[groupby_scan_aggregation] c_obj
+
 cdef Aggregation make_aggregation(op, kwargs=*)
 cdef RollingAggregation make_rolling_aggregation(op, kwargs=*)
+cdef GroupbyAggregation make_groupby_aggregation(op, kwargs=*)
+cdef GroupbyScanAggregation make_groupby_scan_aggregation(op, kwargs=*)
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 211d7c996cb..da407cdbfa8 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -461,6 +461,299 @@ cdef class RollingAggregation:
             ))
         return agg
 
+cdef class GroupbyAggregation:
+    """A Cython wrapper for groupby aggregations.
+
+    **This class should never be instantiated using a standard constructor,
+    only using one of its many factories.** These factories handle mapping
+    different cudf operations to their libcudf analogs, e.g.
+    `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform
+    any additional configuration needed to translate Python arguments into
+    their corresponding C++ types (for instance, C++ enumerations used for
+    flag arguments). The factory approach is necessary to support operations
+    like `df.agg(lambda x: x.sum())`; such functions are called with this
+    class as an argument to generate the desired aggregation.
+    """
+    @property
+    def kind(self):
+        return AggregationKind(self.c_obj.get()[0].kind).name
+
+    @classmethod
+    def sum(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_sum_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def min(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_min_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def max(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_max_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def idxmin(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_argmin_aggregation[
+                groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def idxmax(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_argmax_aggregation[
+                groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def mean(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_mean_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def count(cls, dropna=True):
+        cdef libcudf_types.null_policy c_null_handling
+        if dropna:
+            c_null_handling = libcudf_types.null_policy.EXCLUDE
+        else:
+            c_null_handling = libcudf_types.null_policy.INCLUDE
+
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_count_aggregation[groupby_aggregation](
+                c_null_handling
+            ))
+        return agg
+
+    @classmethod
+    def size(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.make_count_aggregation[groupby_aggregation](
+                <libcudf_types.null_policy><underlying_type_t_null_policy>(
+                    NullHandling.INCLUDE)
+            ))
+        return agg
+
+    @classmethod
+    def collect(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_collect_list_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def nunique(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_nunique_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def nth(cls, libcudf_types.size_type size):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_nth_element_aggregation[groupby_aggregation](size))
+        return agg
+
+    @classmethod
+    def product(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_product_aggregation[groupby_aggregation]())
+        return agg
+    prod = product
+
+    @classmethod
+    def sum_of_squares(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_sum_of_squares_aggregation[groupby_aggregation]()
+        )
+        return agg
+
+    @classmethod
+    def var(cls, ddof=1):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_variance_aggregation[groupby_aggregation](ddof))
+        return agg
+
+    @classmethod
+    def std(cls, ddof=1):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_std_aggregation[groupby_aggregation](ddof))
+        return agg
+
+    @classmethod
+    def median(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_median_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def quantile(cls, q=0.5, interpolation="linear"):
+        cdef GroupbyAggregation agg = cls()
+
+        if not pd.api.types.is_list_like(q):
+            q = [q]
+
+        cdef vector[double] c_q = q
+        cdef libcudf_types.interpolation c_interp = (
+            <libcudf_types.interpolation> (
+                <underlying_type_t_interpolation> (
+                    Interpolation[interpolation.upper()]
+                )
+            )
+        )
+        agg.c_obj = move(
+            libcudf_aggregation.make_quantile_aggregation[groupby_aggregation](
+                c_q, c_interp)
+        )
+        return agg
+
+    @classmethod
+    def unique(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_collect_set_aggregation[groupby_aggregation]())
+        return agg
+
+    @classmethod
+    def first(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_nth_element_aggregation[groupby_aggregation](
+                0,
+                <libcudf_types.null_policy><underlying_type_t_null_policy>(
+                    NullHandling.EXCLUDE
+                )
+            )
+        )
+        return agg
+
+    @classmethod
+    def last(cls):
+        cdef GroupbyAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_nth_element_aggregation[groupby_aggregation](
+                -1,
+                <libcudf_types.null_policy><underlying_type_t_null_policy>(
+                    NullHandling.EXCLUDE
+                )
+            )
+        )
+        return agg
+
+cdef class GroupbyScanAggregation:
+    """A Cython wrapper for groupby scan aggregations.
+
+    **This class should never be instantiated using a standard constructor,
+    only using one of its many factories.** These factories handle mapping
+    different cudf operations to their libcudf analogs, e.g.
+    `cumsum` -> `make_sum_aggregation`. Additionally, they perform
+    any additional configuration needed to translate Python arguments into
+    their corresponding C++ types (for instance, C++ enumerations used for
+    flag arguments). The factory approach is necessary to support operations
+    like `df.agg(lambda x: x.sum())`; such functions are called with this
+    class as an argument to generate the desired aggregation.
+    """
+    @property
+    def kind(self):
+        return AggregationKind(self.c_obj.get()[0].kind).name
+
+    @classmethod
+    def sum(cls):
+        cdef GroupbyScanAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_sum_aggregation[groupby_scan_aggregation]())
+        return agg
+
+    @classmethod
+    def min(cls):
+        cdef GroupbyScanAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_min_aggregation[groupby_scan_aggregation]())
+        return agg
+
+    @classmethod
+    def max(cls):
+        cdef GroupbyScanAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_max_aggregation[groupby_scan_aggregation]())
+        return agg
+
+    @classmethod
+    def count(cls, dropna=True):
+        cdef libcudf_types.null_policy c_null_handling
+        if dropna:
+            c_null_handling = libcudf_types.null_policy.EXCLUDE
+        else:
+            c_null_handling = libcudf_types.null_policy.INCLUDE
+
+        cdef GroupbyScanAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_count_aggregation[groupby_scan_aggregation](c_null_handling))
+        return agg
+
+    @classmethod
+    def size(cls):
+        cdef GroupbyScanAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_count_aggregation[groupby_scan_aggregation](
+                <libcudf_types.null_policy><underlying_type_t_null_policy>(
+                    NullHandling.INCLUDE)
+            ))
+        return agg
+
+    @classmethod
+    def cumcount(cls):
+        cdef GroupbyScanAggregation agg = cls()
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_count_aggregation[groupby_scan_aggregation](
+                libcudf_types.null_policy.INCLUDE
+            ))
+        return agg
+
+    # scan aggregations
+    # TODO: update this after adding per algorithm aggregation derived types
+    # https://github.com/rapidsai/cudf/issues/7106
+    cumsum = sum
+    cummin = min
+    cummax = max
+
+
 cdef Aggregation make_aggregation(op, kwargs=None):
     r"""
     Parameters
@@ -536,3 +829,79 @@ cdef RollingAggregation make_rolling_aggregation(op, kwargs=None):
     else:
         raise TypeError(f"Unknown aggregation {op}")
     return agg
+
+cdef GroupbyAggregation make_groupby_aggregation(op, kwargs=None):
+    r"""
+    Parameters
+    ----------
+    op : str or callable
+        If a str, the name of a `GroupbyAggregation` factory, which is
+        called with `kwargs`. Callables are handled as follows:
+
+        * `list` maps to `GroupbyAggregation.collect()`.
+        * With "dtype" in `kwargs` (a user defined aggregation
+          function), TypeError is raised; UDFs are unsupported.
+        * Otherwise `op` is called with the `GroupbyAggregation`
+          class and is expected to return a `GroupbyAggregation`,
+          e.g., `lambda x: x.quantile(0.5)`.
+    kwargs : dict, optional
+        Any keyword arguments to be passed to the op.
+
+    Returns
+    -------
+    GroupbyAggregation
+    """
+    if kwargs is None:
+        kwargs = {}
+
+    cdef GroupbyAggregation agg
+    if isinstance(op, str):
+        agg = getattr(GroupbyAggregation, op)(**kwargs)
+    elif callable(op):
+        if op is list:
+            agg = GroupbyAggregation.collect()
+        elif "dtype" in kwargs:
+            raise TypeError(f"UDF groupby aggregation {op} is unsupported")
+        else:
+            agg = op(GroupbyAggregation)
+    else:
+        raise TypeError(f"Unknown aggregation {op}")
+    return agg
+
+cdef GroupbyScanAggregation make_groupby_scan_aggregation(op, kwargs=None):
+    r"""
+    Parameters
+    ----------
+    op : str or callable
+        If a str, the name of a `GroupbyScanAggregation` factory,
+        called with `kwargs`. Callables are handled as follows:
+
+        * `list`, or any callable passed with "dtype" in `kwargs`
+          (a user defined aggregation function), raises TypeError;
+          `GroupbyScanAggregation` has no factory for either.
+        * Otherwise `op` is called with the
+          `GroupbyScanAggregation` class and is expected to return
+          one, e.g., `lambda x: x.count(dropna=False)`.
+    kwargs : dict, optional
+        Any keyword arguments to be passed to the op.
+
+    Returns
+    -------
+    GroupbyScanAggregation
+    """
+    if kwargs is None:
+        kwargs = {}
+
+    cdef GroupbyScanAggregation agg
+    if isinstance(op, str):
+        agg = getattr(GroupbyScanAggregation, op)(**kwargs)
+    elif callable(op):
+        if op is list or "dtype" in kwargs:
+            # GroupbyScanAggregation has no `collect` or `from_udf`
+            # factory, so neither can be expressed as a scan.
+            raise TypeError(f"Unsupported scan aggregation {op}")
+        else:
+            agg = op(GroupbyScanAggregation)
+    else:
+        raise TypeError(f"Unknown aggregation {op}")
+    return agg
diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd
index 6daee5077ed..13bfa49057c 100644
--- a/python/cudf/cudf/_lib/cpp/aggregation.pxd
+++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd
@@ -43,6 +43,12 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
     cdef cppclass rolling_aggregation:
         aggregation.Kind kind
 
+    cdef cppclass groupby_aggregation:
+        aggregation.Kind kind
+
+    cdef cppclass groupby_scan_aggregation:
+        aggregation.Kind kind
+
     ctypedef enum udf_type:
         CUDA 'cudf::udf_type::CUDA'
         PTX 'cudf::udf_type::PTX'
diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd
index 2d8f251799d..2ecdf76842f 100644
--- a/python/cudf/cudf/_lib/cpp/groupby.pxd
+++ b/python/cudf/cudf/_lib/cpp/groupby.pxd
@@ -5,7 +5,10 @@ from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.aggregation cimport aggregation
+from cudf._lib.cpp.aggregation cimport (
+    groupby_aggregation,
+    groupby_scan_aggregation,
+)
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
@@ -26,7 +29,12 @@ cdef extern from "cudf/groupby.hpp" \
     cdef cppclass aggregation_request:
         aggregation_request() except +
         column_view values
-        vector[unique_ptr[aggregation]] aggregations
+        vector[unique_ptr[groupby_aggregation]] aggregations
+
+    cdef cppclass scan_request:
+        scan_request() except +
+        column_view values
+        vector[unique_ptr[groupby_scan_aggregation]] aggregations
 
     cdef cppclass aggregation_result:
         vector[unique_ptr[column]] results
@@ -76,7 +84,7 @@ cdef extern from "cudf/groupby.hpp" \
             unique_ptr[table],
             vector[aggregation_result]
         ] scan(
-            const vector[aggregation_request]& requests,
+            const vector[scan_request]& requests,
         ) except +
 
         pair[
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index ed9820300d8..d7416625248 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -32,7 +32,12 @@ from cudf._lib.scalar import as_device_scalar
 
 cimport cudf._lib.cpp.groupby as libcudf_groupby
 cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.aggregation cimport Aggregation, make_aggregation
+from cudf._lib.aggregation cimport (
+    GroupbyAggregation,
+    GroupbyScanAggregation,
+    make_groupby_aggregation,
+    make_groupby_scan_aggregation,
+)
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
@@ -105,30 +110,13 @@ cdef class GroupBy:
         )
         return grouped_keys, grouped_values, c_group_offsets
 
-    def aggregate(self, Table values, aggregations):
-        """
-        Parameters
-        ----------
-        values : Table
-        aggregations
-            A dict mapping column names in `Table` to a list of aggregations
-            to perform on that column
-
-            Each aggregation may be specified as:
-            - a string (e.g., "max")
-            - a lambda/function
-
-        Returns
-        -------
-        Table of aggregated values
-        """
+    def aggregate_internal(self, Table values, aggregations):
         from cudf.core.column_accessor import ColumnAccessor
         cdef vector[libcudf_groupby.aggregation_request] c_agg_requests
         cdef libcudf_groupby.aggregation_request c_agg_request
         cdef Column col
-        cdef Aggregation agg_obj
+        cdef GroupbyAggregation agg_obj
 
-        cdef bool scan = _is_all_scan_aggregate(aggregations)
         allow_empty = all(len(v) == 0 for v in aggregations.values())
 
         included_aggregations = defaultdict(list)
@@ -154,7 +142,7 @@ cdef class GroupBy:
 
             c_agg_request = move(libcudf_groupby.aggregation_request())
             for agg in aggs:
-                agg_obj = make_aggregation(agg)
+                agg_obj = make_groupby_aggregation(agg)
                 if (valid_aggregations == "ALL"
                         or agg_obj.kind in valid_aggregations):
                     included_aggregations[col_name].append(agg)
@@ -175,30 +163,90 @@ cdef class GroupBy:
             vector[libcudf_groupby.aggregation_result]
         ] c_result
 
-        try:
-            with nogil:
-                if scan:
-                    c_result = move(
-                        self.c_obj.get()[0].scan(
-                            c_agg_requests
-                        )
-                    )
-                else:
-                    c_result = move(
-                        self.c_obj.get()[0].aggregate(
-                            c_agg_requests
-                        )
+        with nogil:
+            c_result = move(
+                self.c_obj.get()[0].aggregate(
+                    c_agg_requests
+                )
+            )
+
+        grouped_keys, _ = data_from_unique_ptr(
+            move(c_result.first),
+            column_names=self.keys._column_names
+        )
+
+        result_data = ColumnAccessor(multiindex=True)
+        # Note: This loop relies on the included_aggregations dict being
+        # insertion ordered to map results to requested aggregations by index.
+        for i, col_name in enumerate(included_aggregations):
+            for j, agg_name in enumerate(included_aggregations[col_name]):
+                if callable(agg_name):
+                    agg_name = agg_name.__name__
+                result_data[(col_name, agg_name)] = (
+                    Column.from_unique_ptr(move(c_result.second[i].results[j]))
+                )
+
+        return result_data, cudf.Index._from_data(grouped_keys)
+
+    def scan_internal(self, Table values, aggregations):
+        from cudf.core.column_accessor import ColumnAccessor
+        cdef vector[libcudf_groupby.scan_request] c_agg_requests
+        cdef libcudf_groupby.scan_request c_agg_request
+        cdef Column col
+        cdef GroupbyScanAggregation agg_obj
+
+        allow_empty = all(len(v) == 0 for v in aggregations.values())
+
+        included_aggregations = defaultdict(list)
+        for i, (col_name, aggs) in enumerate(aggregations.items()):
+            col = values._data[col_name]
+            dtype = col.dtype
+
+            valid_aggregations = (
+                _LIST_AGGS if is_list_dtype(dtype)
+                else _STRING_AGGS if is_string_dtype(dtype)
+                else _CATEGORICAL_AGGS if is_categorical_dtype(dtype)
+                else _STRUCT_AGGS if is_struct_dtype(dtype)
+                else _INTERVAL_AGGS if is_interval_dtype(dtype)
+                else _DECIMAL_AGGS if is_decimal_dtype(dtype)
+                else "ALL"
+            )
+            if (valid_aggregations is _DECIMAL_AGGS
+                    and rmm._cuda.gpu.runtimeGetVersion() < 11000):
+                raise RuntimeError(
+                    "Decimal aggregations are only supported on CUDA >= 11 "
+                    "due to an nvcc compiler bug."
+                )
+
+            c_agg_request = move(libcudf_groupby.scan_request())
+            for agg in aggs:
+                agg_obj = make_groupby_scan_aggregation(agg)
+                if (valid_aggregations == "ALL"
+                        or agg_obj.kind in valid_aggregations):
+                    included_aggregations[col_name].append(agg)
+                    c_agg_request.aggregations.push_back(
+                        move(agg_obj.c_obj)
                     )
-        except RuntimeError as e:
-            # TODO: remove this try..except after
-            # https://github.com/rapidsai/cudf/issues/7611
-            # is resolved
-            if ("make_empty_column") in str(e):
-                raise NotImplementedError(
-                    "Aggregation not supported for empty columns"
-                ) from e
-            else:
-                raise
+            if not c_agg_request.aggregations.empty():
+                c_agg_request.values = col.view()
+                c_agg_requests.push_back(
+                    move(c_agg_request)
+                )
+
+        if c_agg_requests.empty() and not allow_empty:
+            raise DataError("All requested aggregations are unsupported.")
+
+        cdef pair[
+            unique_ptr[table],
+            vector[libcudf_groupby.aggregation_result]
+        ] c_result
+
+        with nogil:
+            c_result = move(
+                self.c_obj.get()[0].scan(
+                    c_agg_requests
+                )
+            )
 
         grouped_keys, _ = data_from_unique_ptr(
             move(c_result.first),
@@ -218,6 +266,28 @@ cdef class GroupBy:
 
         return result_data, cudf.Index._from_data(grouped_keys)
 
+    def aggregate(self, Table values, aggregations):
+        """
+        Parameters
+        ----------
+        values : Table
+        aggregations
+            A dict mapping column names in `Table` to a list of aggregations
+            to perform on that column
+
+            Each aggregation may be specified as:
+            - a string (e.g., "max")
+            - a lambda/function
+
+        Returns
+        -------
+        Tuple of (aggregated column data, Index of the group keys)
+        """
+        if _is_all_scan_aggregate(aggregations):
+            return self.scan_internal(values, aggregations)
+
+        return self.aggregate_internal(values, aggregations)
+
     def shift(self, Table values, int periods, list fill_values):
         cdef table_view view = values.view()
         cdef size_type num_col = view.num_columns()

From 8b02ca32c8f9004de5bbb1d76bd77b17358538b4 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Thu, 19 Aug 2021 11:28:31 -0500
Subject: [PATCH 14/46] use correct namespace in cuio code examples (#9037)

Use the correct `cudf::io` namespace in the cuio reader/writer code examples.

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/9037
---
 cpp/include/cudf/io/avro.hpp    |  9 +++------
 cpp/include/cudf/io/csv.hpp     | 18 ++++++++----------
 cpp/include/cudf/io/json.hpp    |  8 +++-----
 cpp/include/cudf/io/orc.hpp     | 18 ++++++------------
 cpp/include/cudf/io/parquet.hpp | 29 ++++++++++-------------------
 5 files changed, 30 insertions(+), 52 deletions(-)

diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp
index 34410209c72..774690c939f 100644
--- a/cpp/include/cudf/io/avro.hpp
+++ b/cpp/include/cudf/io/avro.hpp
@@ -195,12 +195,9 @@ class avro_reader_options_builder {
  *
  * The following code snippet demonstrates how to read a dataset from a file:
  * @code
- *  ...
- *  std::string filepath = "dataset.avro";
- *  cudf::avro_reader_options options =
- * cudf::avro_reader_options::builder(cudf::source_info(filepath));
- *  ...
- *  auto result = cudf::read_avro(options);
+ *  auto source  = cudf::io::source_info("dataset.avro");
+ *  auto options = cudf::io::avro_reader_options::builder(source);
+ *  auto result  = cudf::io::read_avro(options);
  * @endcode
  *
  * @param options Settings for controlling reading behavior.
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index d4a21b2e98c..fece1cb52b0 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -1185,11 +1185,9 @@ class csv_reader_options_builder {
  *
  * The following code snippet demonstrates how to read a dataset from a file:
  * @code
- *  std::string filepath = "dataset.csv";
- *  cudf::io::csv_reader_options options =
- * cudf::io::csv_reader_options::builder(cudf::source_info(filepath));
- *  ...
- *  auto result = cudf::read_csv(options);
+ *  auto source  = cudf::io::source_info("dataset.csv");
+ *  auto options = cudf::io::csv_reader_options::builder(source);
+ *  auto result  = cudf::io::read_csv(options);
  * @endcode
  *
  * @param options Settings for controlling reading behavior.
@@ -1514,12 +1512,12 @@ class csv_writer_options_builder {
  *
  * The following code snippet demonstrates how to write columns to a file:
  * @code
- *  std::string filepath = "dataset.csv";
- *  cudf::io::sink_info sink_info(filepath);
+ *  auto destination = cudf::io::sink_info("dataset.csv");
+ *  auto options     = cudf::io::csv_writer_options(destination, table->view())
+ *    .na_rep(na)
+ *    .include_header(include_header)
+ *    .rows_per_chunk(rows_per_chunk);
  *
- *  cudf::io::csv_writer_options options = cudf::io::csv_writer_options(sink_info,
- * table->view()).na_rep(na).include_header(include_header).rows_per_chunk(rows_per_chunk);
- *  ...
  *  cudf::io::write_csv(options);
  * @endcode
  *
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 8954f7dcab1..60f990c87d8 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -350,11 +350,9 @@ class json_reader_options_builder {
  *
  * The following code snippet demonstrates how to read a dataset from a file:
  * @code
- *  ...
- *  std::string filepath = "dataset.json";
- *  cudf::read_json_options options = cudf::read_json_options::builder(cudf::source_info(filepath));
- *  ...
- *  auto result = cudf::read_json(options);
+ *  auto source  = cudf::io::source_info("dataset.json");
+ *  auto options = cudf::io::read_json_options::builder(source);
+ *  auto result  = cudf::io::read_json(options);
  * @endcode
  *
  * @param options Settings for controlling reading behavior.
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 52d3138d3a1..4ae09b516a4 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -346,12 +346,9 @@ class orc_reader_options_builder {
  *
  * The following code snippet demonstrates how to read a dataset from a file:
  * @code
- *  ...
- *  std::string filepath = "dataset.orc";
- *  cudf::orc_reader_options options =
- * cudf::orc_reader_options::builder(cudf::source_info(filepath));
- *  ...
- *  auto result = cudf::read_orc(options);
+ *  auto source  = cudf::io::source_info("dataset.orc");
+ *  auto options = cudf::io::orc_reader_options::builder(source);
+ *  auto result  = cudf::io::read_orc(options);
  * @endcode
  *
  * Note: Support for reading files with struct columns is currently experimental, the output may not
@@ -568,12 +565,9 @@ class orc_writer_options_builder {
  *
  * The following code snippet demonstrates how to write columns to a file:
  * @code
- *  ...
- *  std::string filepath = "dataset.orc";
- *  cudf::orc_writer_options options = cudf::orc_writer_options::builder(cudf::sink_info(filepath),
- * table->view());
- *  ...
- *  cudf::write_orc(options);
+ *  auto destination = cudf::io::sink_info("dataset.orc");
+ *  auto options     = cudf::io::orc_writer_options::builder(destination, table->view());
+ *  cudf::io::write_orc(options);
  * @endcode
  *
  * @param options Settings for controlling reading behavior.
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 031228ae6de..25cbb6fd554 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -354,12 +354,9 @@ class parquet_reader_options_builder {
  *
  * The following code snippet demonstrates how to read a dataset from a file:
  * @code
- *  ...
- *  std::string filepath = "dataset.parquet";
- *  cudf::io::parquet_reader_options options =
- *  cudf::io::parquet_reader_options::builder(cudf::source_info(filepath));
- *  ...
- *  auto result = cudf::read_parquet(options);
+ *  auto source  = cudf::io::source_info("dataset.parquet");
+ *  auto options = cudf::io::parquet_reader_options::builder(source);
+ *  auto result  = cudf::io::read_parquet(options);
  * @endcode
  *
  * @param options Settings for controlling reading behavior
@@ -784,12 +781,9 @@ class parquet_writer_options_builder {
  *
  * The following code snippet demonstrates how to write columns to a file:
  * @code
- *  ...
- *  std::string filepath = "dataset.parquet";
- *  cudf::io::parquet_writer_options options =
- *  cudf::io::parquet_writer_options::builder(cudf::sink_info(filepath), table->view());
- *  ...
- *  cudf::write_parquet(options);
+ *  auto destination = cudf::io::sink_info("dataset.parquet");
+ *  auto options     = cudf::io::parquet_writer_options::builder(destination, table->view());
+ *  cudf::io::write_parquet(options);
  * @endcode
  *
  * @param options Settings for controlling writing behavior.
@@ -1019,15 +1013,12 @@ std::unique_ptr<std::vector<uint8_t>> merge_rowgroup_metadata(
  * one logical table by writing a series of individual cudf::tables.
  *
  * @code
- *  ...
- *  std::string filepath = "dataset.parquet";
- *  cudf::io::chunked_parquet_writer_options options =
- *  cudf::io::chunked_parquet_writer_options::builder(cudf::sink_info(filepath), table->view());
- *  ...
- *  cudf::io::parquet_chunked_writer writer(options)
+ *  auto destination = cudf::io::sink_info("dataset.parquet");
+ *  auto options = cudf::io::chunked_parquet_writer_options::builder(destination, table->view());
+ *  auto writer  = cudf::io::parquet_chunked_writer(options);
+ *
  *  writer.write(table0)
  *  writer.write(table1)
- *  ...
  *  writer.close()
  *  @endcode
  */

From c3f06a57c1bf3489d96916739654398a2bd3bb6a Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Thu, 19 Aug 2021 16:17:45 -0500
Subject: [PATCH 15/46] Fix compilation errors in groupby benchmarks. (#9072)

Fixes a couple of compilation errors caused by the new groupby_aggregation class.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Christopher Harris (https://github.com/cwharris)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Nghia Truong (https://github.com/ttnghia)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/9072
---
 cpp/benchmarks/groupby/group_nth_benchmark.cu | 3 ++-
 cpp/benchmarks/groupby/group_sum_benchmark.cu | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/benchmarks/groupby/group_nth_benchmark.cu b/cpp/benchmarks/groupby/group_nth_benchmark.cu
index c6dbffb162e..8d1de36db95 100644
--- a/cpp/benchmarks/groupby/group_nth_benchmark.cu
+++ b/cpp/benchmarks/groupby/group_nth_benchmark.cu
@@ -63,7 +63,8 @@ void BM_pre_sorted_nth(benchmark::State& state)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = vals;
-  requests[0].aggregations.push_back(cudf::make_nth_element_aggregation<groupby_aggregation>(-1));
+  requests[0].aggregations.push_back(
+    cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(-1));
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
diff --git a/cpp/benchmarks/groupby/group_sum_benchmark.cu b/cpp/benchmarks/groupby/group_sum_benchmark.cu
index 1455f1cecdc..6351da66fdd 100644
--- a/cpp/benchmarks/groupby/group_sum_benchmark.cu
+++ b/cpp/benchmarks/groupby/group_sum_benchmark.cu
@@ -58,7 +58,7 @@ void BM_basic_sum(benchmark::State& state)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = vals;
-  requests[0].aggregations.push_back(cudf::make_sum_aggregation());
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
@@ -97,7 +97,7 @@ void BM_pre_sorted_sum(benchmark::State& state)
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = vals;
-  requests[0].aggregations.push_back(cudf::make_sum_aggregation());
+  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);

From eb85d77ccccbfece37b69c653cf6ce08e885d849 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 19 Aug 2021 23:13:59 -0700
Subject: [PATCH 16/46] Remove the option to pass data types as strings to
 `read_csv` and `read_json` (#9079)

closes #8240

Also added a missing check for dtype count to the JSON reader and negative tests for the check.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Elias Stehle (https://github.com/elstehle)

URL: https://github.com/rapidsai/cudf/pull/9079
---
 cpp/include/cudf/io/csv.hpp     | 38 +--------------
 cpp/include/cudf/io/json.hpp    | 38 +--------------
 cpp/src/io/csv/reader_impl.cu   | 86 +--------------------------------
 cpp/src/io/csv/reader_impl.hpp  |  9 ----
 cpp/src/io/json/reader_impl.cu  | 79 ++++++++----------------------
 cpp/src/io/json/reader_impl.hpp |  2 -
 cpp/tests/io/csv_test.cpp       |  6 ++-
 cpp/tests/io/json_test.cpp      | 23 +++++++++
 8 files changed, 54 insertions(+), 227 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index fece1cb52b0..455ffce7ed8 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -115,8 +115,7 @@ class csv_reader_options {
   // Conversion settings
 
   // Per-column types; disables type inference on those columns
-  std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
-    _dtypes;
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
   // Additional values to recognize as boolean true values
   std::vector<std::string> _true_values{"True", "TRUE", "true"};
   // Additional values to recognize as boolean false values
@@ -305,10 +304,7 @@ class csv_reader_options {
   /**
    * @brief Returns per-column types.
    */
-  std::variant<std::vector<std::string>,
-               std::vector<data_type>,
-               std::map<std::string, data_type>> const&
-  get_dtypes() const
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
   {
     return _dtypes;
   }
@@ -608,20 +604,6 @@ class csv_reader_options {
    */
   void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
 
-  /**
-   * @brief Sets per-column types, specified by the type's respective string representation.
-   *
-   * @param types Vector of dtypes in which the column needs to be read.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] void
-  set_dtypes(std::vector<std::string> types)
-  {
-    _dtypes = std::move(types);
-  }
-
   /**
    * @brief Sets additional values to recognize as boolean true values.
    *
@@ -1067,22 +1049,6 @@ class csv_reader_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Sets per-column types, specified by the type's respective string representation.
-   *
-   * @param types Vector of dtypes in which the column needs to be read.
-   * @return this for chaining.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] csv_reader_options_builder&
-  dtypes(std::vector<std::string> types)
-  {
-    options._dtypes = std::move(types);
-    return *this;
-  }
-
   /**
    * @brief Sets additional values to recognize as boolean true values.
    *
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 60f990c87d8..31201e30ac6 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -68,8 +68,7 @@ class json_reader_options {
   source_info _source;
 
   // Data types of the column; empty to infer dtypes
-  std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
-    _dtypes;
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
   // Specify the compression format of the source or infer from file extension
   compression_type _compression = compression_type::AUTO;
 
@@ -117,10 +116,7 @@ class json_reader_options {
   /**
    * @brief Returns data types of the columns.
    */
-  std::variant<std::vector<std::string>,
-               std::vector<data_type>,
-               std::map<std::string, data_type>> const&
-  get_dtypes() const
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
   {
     return _dtypes;
   }
@@ -150,20 +146,6 @@ class json_reader_options {
    */
   bool is_enabled_dayfirst() const { return _dayfirst; }
 
-  /**
-   * @brief Set data types for columns to be read.
-   *
-   * @param types Vector of dtypes in string format.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] void
-  set_dtypes(std::vector<std::string> types)
-  {
-    _dtypes = std::move(types);
-  }
-
   /**
    * @brief Set data types for columns to be read.
    *
@@ -232,22 +214,6 @@ class json_reader_options_builder {
    */
   explicit json_reader_options_builder(source_info const& src) : options(src) {}
 
-  /**
-   * @brief Set data types for columns to be read.
-   *
-   * @param types Vector of dtypes in string format
-   * @return this for chaining
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] json_reader_options_builder&
-  dtypes(std::vector<std::string> types)
-  {
-    options._dtypes = std::move(types);
-    return *this;
-  }
-
   /**
    * @brief Set data types for columns to be read.
    *
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 549b0474fe1..7f85589a8aa 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -27,7 +27,6 @@
 
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/table/table.hpp>
@@ -420,14 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   if (has_to_infer_column_types) {
     column_types = infer_column_types(data, row_offsets, stream);
   } else {
-    column_types = std::visit(
-      cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
-        [&](const std::map<std::string, data_type>& data_types) {
-          return select_data_types(data_types);
-        },
-        [&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
-      opts_.get_dtypes());
+    column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); },
+                              opts_.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());
@@ -707,81 +700,6 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
   return dtypes;
 }
 
-std::vector<data_type> reader::impl::parse_column_types(
-  const std::vector<std::string>& types_as_strings)
-{
-  std::vector<data_type> dtypes;
-
-  const bool is_dict = std::all_of(types_as_strings.begin(),
-                                   types_as_strings.end(),
-                                   [](const auto& s) { return s.find(':') != std::string::npos; });
-
-  if (!is_dict) {
-    if (types_as_strings.size() == 1) {
-      // If it's a single dtype, assign that dtype to all active columns
-      data_type dtype_;
-      column_parse::flags col_flags_;
-      std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
-      dtypes.resize(num_active_cols_, dtype_);
-      for (int col = 0; col < num_actual_cols_; col++) {
-        column_flags_[col] |= col_flags_;
-      }
-      CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-    } else {
-      // If it's a list, assign dtypes to active columns in the given order
-      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_cols_,
-                   "Must specify data types for all columns");
-
-      auto dtype_ = std::back_inserter(dtypes);
-
-      for (int col = 0; col < num_actual_cols_; col++) {
-        if (column_flags_[col] & column_parse::enabled) {
-          column_parse::flags col_flags_;
-          std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
-          column_flags_[col] |= col_flags_;
-          CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-        }
-      }
-    }
-  } else {
-    // Translate vector of `name : dtype` strings to map
-    // NOTE: Incoming pairs can be out-of-order from column names in dataset
-    std::unordered_map<std::string, std::string> col_type_map;
-    for (const auto& pair : types_as_strings) {
-      const auto pos     = pair.find_last_of(':');
-      const auto name    = pair.substr(0, pos);
-      const auto dtype   = pair.substr(pos + 1, pair.size());
-      col_type_map[name] = dtype;
-    }
-
-    auto dtype_ = std::back_inserter(dtypes);
-
-    for (int col = 0; col < num_actual_cols_; col++) {
-      if (column_flags_[col] & column_parse::enabled) {
-        CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(),
-                     "Must specify data types for all active columns");
-        column_parse::flags col_flags_;
-        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]);
-        column_flags_[col] |= col_flags_;
-        CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-      }
-    }
-  }
-
-  if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
-    for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
-    }
-  }
-
-  for (size_t i = 0; i < dtypes.size(); i++) {
-    // Replace EMPTY dtype with STRING
-    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
-  }
-
-  return dtypes;
-}
-
 std::vector<column_buffer> reader::impl::decode_data(device_span<char const> data,
                                                      device_span<uint64_t const> row_offsets,
                                                      host_span<data_type const> column_types,
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 36c2bf4f9e7..4416457be16 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -197,15 +197,6 @@ class reader::impl {
    */
   std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);
 
-  /**
-   * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
-   *
-   * @param types_as_strings The vector of strings from which to parse the columns' target data
-   * types
-   * @return List of columns' data types
-   */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);
-
   /**
    * @brief Converts the row-column data and outputs to column bufferrs.
    *
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 85608a0984a..f1080342312 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -466,71 +466,32 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
   }
 }
 
-std::vector<data_type> reader::impl::parse_data_types(
-  std::vector<std::string> const& types_as_strings)
-{
-  CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(),
-               "Need to specify the type of each column.\n");
-  std::vector<data_type> dtypes;
-  // Assume that the dtype is in dictionary format only if all elements contain a colon
-  const bool is_dict = std::all_of(
-    std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
-      return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
-    });
-
-  auto split_on_colon = [](std::string_view s) {
-    auto const i = s.find(":");
-    return std::pair{s.substr(0, i), s.substr(i + 1)};
-  };
-
-  if (is_dict) {
-    std::map<std::string, data_type> col_type_map;
-    std::transform(
-      std::cbegin(types_as_strings),
-      std::cend(types_as_strings),
-      std::inserter(col_type_map, col_type_map.end()),
-      [&](auto const& ts) {
-        auto const [col_name, type_str] = split_on_colon(ts);
-        return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})};
-      });
-
-    // Using the map here allows O(n log n) complexity
-    std::transform(std::cbegin(metadata_.column_names),
-                   std::cend(metadata_.column_names),
-                   std::back_inserter(dtypes),
-                   [&](auto const& column_name) { return col_type_map[column_name]; });
-  } else {
-    std::transform(std::cbegin(types_as_strings),
-                   std::cend(types_as_strings),
-                   std::back_inserter(dtypes),
-                   [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
-  }
-  return dtypes;
-}
-
 void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
                                   rmm::cuda_stream_view stream)
 {
   bool has_to_infer_column_types =
     std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes());
   if (!has_to_infer_column_types) {
-    dtypes_ = std::visit(
-      cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& dtypes) { return dtypes; },
-        [&](const std::map<std::string, data_type>& dtypes) {
-          std::vector<data_type> sorted_dtypes;
-          std::transform(std::cbegin(metadata_.column_names),
-                         std::cend(metadata_.column_names),
-                         std::back_inserter(sorted_dtypes),
-                         [&](auto const& column_name) {
-                           auto const it = dtypes.find(column_name);
-                           CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
-                           return it->second;
-                         });
-          return sorted_dtypes;
-        },
-        [&](std::vector<std::string> const& dtypes) { return parse_data_types(dtypes); }},
-      options_.get_dtypes());
+    dtypes_ = std::visit(cudf::detail::visitor_overload{
+                           [&](const std::vector<data_type>& dtypes) {
+                             CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(),
+                                          "Must specify types for all columns");
+                             return dtypes;
+                           },
+                           [&](const std::map<std::string, data_type>& dtypes) {
+                             std::vector<data_type> sorted_dtypes;
+                             std::transform(std::cbegin(metadata_.column_names),
+                                            std::cend(metadata_.column_names),
+                                            std::back_inserter(sorted_dtypes),
+                                            [&](auto const& column_name) {
+                                              auto const it = dtypes.find(column_name);
+                                              CUDF_EXPECTS(it != dtypes.end(),
+                                                           "Must specify types for all columns");
+                                              return it->second;
+                                            });
+                             return sorted_dtypes;
+                           }},
+                         options_.get_dtypes());
   } else {
     CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n");
     auto const num_columns       = metadata_.column_names.size();
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index 5cf51369cdf..bbda7e9ba74 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -158,8 +158,6 @@ class reader::impl {
    */
   void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);
 
-  std::vector<data_type> parse_data_types(std::vector<std::string> const& types_as_strings);
-
   /**
    * @brief Set the data type array data member
    *
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 53e0ab14fd3..5b6270a8be1 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1858,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str", "int32", "int32", "int32"});
+      .dtypes({dtype<int32_t>(),
+               dtype<cudf::string_view>(),
+               dtype<int32_t>(),
+               dtype<int32_t>(),
+               dtype<int32_t>()});
   auto result = cudf_io::read_csv(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index a263fa0fce0..e83592a028a 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -888,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs)
                                  float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity});
 }
 
+TEST_F(JsonReaderTest, BadDtypeParams)
+{
+  std::string buffer = "[1,2,3,4]";
+
+  cudf_io::json_reader_options options_vec =
+    cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()})
+      .lines(true)
+      .dtypes({dtype<int8_t>()});
+
+  // should throw because there are four columns and only one dtype
+  EXPECT_THROW(cudf_io::read_json(options_vec), cudf::logic_error);
+
+  cudf_io::json_reader_options options_map =
+    cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()})
+      .lines(true)
+      .dtypes(std::map<std::string, cudf::data_type>{{"0", dtype<int8_t>()},
+                                                     {"1", dtype<int8_t>()},
+                                                     {"2", dtype<int8_t>()},
+                                                     {"wrong_name", dtype<int8_t>()}});
+  // should throw because one of the columns is not in the dtype map
+  EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 3ea4b4236fbf3adf8fabe46f46738c64455c3e79 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Fri, 20 Aug 2021 10:14:53 -0500
Subject: [PATCH 17/46] Java bindings for cudf::hash_join (#9080)

Adds Java APIs to build a hash table for the right side table in a join and re-use it to join against a series of left probe tables.  It also exposes the ability to compute the join output row count and pass that count to a subsequent call to produce the join gather maps to avoid redundant computation when the output row count must be examined before manifesting the gather maps.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/9080
---
 .../main/java/ai/rapids/cudf/HashJoin.java    | 127 ++++++++
 .../java/ai/rapids/cudf/MemoryCleaner.java    |   4 +
 java/src/main/java/ai/rapids/cudf/Table.java  | 220 +++++++++++++
 java/src/main/native/CMakeLists.txt           |   1 +
 java/src/main/native/src/HashJoinJni.cpp      |  45 +++
 java/src/main/native/src/TableJni.cpp         | 237 ++++++++++----
 .../java/ai/rapids/cudf/HashJoinTest.java     |  45 +++
 .../test/java/ai/rapids/cudf/TableTest.java   | 288 +++++++++++++++++-
 8 files changed, 900 insertions(+), 67 deletions(-)
 create mode 100644 java/src/main/java/ai/rapids/cudf/HashJoin.java
 create mode 100644 java/src/main/native/src/HashJoinJni.cpp
 create mode 100644 java/src/test/java/ai/rapids/cudf/HashJoinTest.java

diff --git a/java/src/main/java/ai/rapids/cudf/HashJoin.java b/java/src/main/java/ai/rapids/cudf/HashJoin.java
new file mode 100644
index 00000000000..620a7ce6a6c
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/HashJoin.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class represents a hash table built from the join keys of the right-side table for a
+ * join operation. This hash table can then be reused across a series of left probe tables
+ * to compute gather maps for joins more efficiently when the right-side table is not changing.
+ * It can also be used to query the output row count of a join and then pass that result to the
+ * operation that generates the join gather maps to avoid redundant computation when the output
+ * row count must be checked before manifesting the join gather maps.
+ */
+public class HashJoin implements AutoCloseable {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  private static final Logger log = LoggerFactory.getLogger(HashJoin.class);
+
+  private static class HashJoinCleaner extends MemoryCleaner.Cleaner {
+    private Table buildKeys;
+    private long nativeHandle;
+
+    HashJoinCleaner(Table buildKeys, long nativeHandle) {
+      this.buildKeys = buildKeys;
+      this.nativeHandle = nativeHandle;
+      addRef();
+    }
+
+    @Override
+    protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) {
+      long origAddress = nativeHandle;
+      boolean neededCleanup = nativeHandle != 0;
+      if (neededCleanup) {
+        try {
+          destroy(nativeHandle);
+          buildKeys.close();
+          buildKeys = null;
+        } finally {
+          nativeHandle = 0; // always cleared, so a later call sees nothing left to clean
+        }
+        if (logErrorIfNotClean) {
+          log.error("A HASH TABLE WAS LEAKED (ID: " + id + " " + Long.toHexString(origAddress) + ")");
+        }
+      }
+      return neededCleanup;
+    }
+
+    @Override
+    public boolean isClean() {
+      return nativeHandle == 0;
+    }
+  }
+
+  private final HashJoinCleaner cleaner;
+  private final boolean compareNulls;
+  private boolean isClosed = false;
+
+  /**
+   * Construct a hash table for a join from a table representing the join key columns from the
+   * right-side table in the join. The resulting instance must be closed to release the
+   * GPU resources associated with the instance.
+   * @param buildKeys table view containing the join keys for the right-side join table
+   * @param compareNulls true if null key values should match otherwise false
+   */
+  public HashJoin(Table buildKeys, boolean compareNulls) {
+    this.compareNulls = compareNulls;
+    Table buildTable = new Table(buildKeys.getColumns());
+    try {
+      long handle = create(buildTable.getNativeView(), compareNulls);
+      this.cleaner = new HashJoinCleaner(buildTable, handle);
+      MemoryCleaner.register(this, cleaner);
+    } catch (Throwable t) {
+      try {
+        buildTable.close();
+      } catch (Throwable t2) {
+        t.addSuppressed(t2);
+      }
+      throw t;
+    }
+  }
+
+  @Override
+  public synchronized void close() {
+    cleaner.delRef();
+    if (isClosed) {
+      cleaner.logRefCountDebug("double free " + this);
+      throw new IllegalStateException("Close called too many times " + this);
+    }
+    cleaner.clean(false);
+    isClosed = true;
+  }
+
+  long getNativeView() {
+    return cleaner.nativeHandle;
+  }
+
+  /** Get the number of join key columns for the table that was used to generate the hash table. */
+  public long getNumberOfColumns() {
+    return cleaner.buildKeys.getNumberOfColumns();
+  }
+
+  /** Returns true if the hash table was built to match on nulls otherwise false. */
+  public boolean getCompareNulls() {
+    return compareNulls;
+  }
+
+  private static native long create(long tableView, boolean nullEqual);
+  private static native void destroy(long handle);
+}
diff --git a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java
index 4bf38543a2d..a936d4830ee 100644
--- a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java
+++ b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java
@@ -277,6 +277,10 @@ public static void register(CompiledExpression expr, Cleaner cleaner) {
     all.add(new CleanerWeakReference(expr, cleaner, collected, false));
   }
 
+  static void register(HashJoin hashJoin, Cleaner cleaner) {
+    all.add(new CleanerWeakReference(hashJoin, cleaner, collected, true));
+  }
+
   /**
    * This is not 100% perfect and we can still run into situations where RMM buffers were not
    * collected and this returns false because of thread race conditions. This is just a best effort.
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 1fc9616d607..e725932ed5e 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -505,18 +505,48 @@ private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long r
   private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys,
                                                   boolean compareNullsEqual) throws CudfException;
 
+  private static native long leftJoinRowCount(long leftTable, long rightHashJoin,
+                                              boolean nullsEqual) throws CudfException;
+
+  private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin,
+                                                      boolean nullsEqual) throws CudfException;
+
+  private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin,
+                                                               boolean nullsEqual,
+                                                               long outputRowCount) throws CudfException;
+
   private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                          int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
   private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys,
                                                    boolean compareNullsEqual) throws CudfException;
 
+  private static native long innerJoinRowCount(long table, long hashJoin,
+                                               boolean nullsEqual) throws CudfException;
+
+  private static native long[] innerHashJoinGatherMaps(long table, long hashJoin,
+                                                       boolean nullsEqual) throws CudfException;
+
+  private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin,
+                                                                boolean nullsEqual,
+                                                                long outputRowCount) throws CudfException;
+
   private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                          int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
   private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys,
                                                   boolean compareNullsEqual) throws CudfException;
 
+  private static native long fullJoinRowCount(long leftTable, long rightHashJoin,
+                                              boolean nullsEqual) throws CudfException;
+
+  private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin,
+                                                      boolean nullsEqual) throws CudfException;
+
+  private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin,
+                                                               boolean nullsEqual,
+                                                               long outputRowCount) throws CudfException;
+
   private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable,
       int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
@@ -2040,6 +2070,69 @@ public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual
     return buildJoinGatherMaps(gatherMapData);
   }
 
+  /**
+   * Computes the number of rows resulting from a left equi-join between two tables.
+   * It is assumed this table instance holds the key columns from the left table, and the
+   * {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return row count of the join result
+   */
+  public long leftJoinRowCount(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    return leftJoinRowCount(getNativeView(), rightHash.getNativeView(),
+        rightHash.getCompareNulls());
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a left equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the left join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return left and right table gather maps
+   */
+  public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls());
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a left equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the left join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * This interface allows passing an output row count that was previously computed from
+   * {@link #leftJoinRowCount(HashJoin)}.
+   * WARNING: Passing a row count that is smaller than the actual row count will result
+   * in undefined behavior.
+   * @param rightHash hash table built from join key columns from the right table
+   * @param outputRowCount number of output rows in the join result
+   * @return left and right table gather maps
+   */
+  public GatherMap[] leftJoinGatherMaps(HashJoin rightHash, long outputRowCount) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        leftHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls(), outputRowCount);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
   /**
    * Computes the number of rows from the result of a left join between two tables when a
    * conditional expression is true. It is assumed this table instance holds the columns from
@@ -2124,6 +2217,67 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua
     return buildJoinGatherMaps(gatherMapData);
   }
 
+  /**
+   * Computes the number of rows resulting from an inner equi-join between two tables.
+   * @param otherHash hash table built from join key columns from the other table
+   * @return row count of the join result
+   */
+  public long innerJoinRowCount(HashJoin otherHash) {
+    if (getNumberOfColumns() != otherHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " otherKeys: " + otherHash.getNumberOfColumns());
+    }
+    return innerJoinRowCount(getNativeView(), otherHash.getNativeView(),
+        otherHash.getCompareNulls());
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of an inner equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the inner join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return left and right table gather maps
+   */
+  public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls());
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of an inner equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the inner join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * This interface allows passing an output row count that was previously computed from
+   * {@link #innerJoinRowCount(HashJoin)}.
+   * WARNING: Passing a row count that is smaller than the actual row count will result
+   * in undefined behavior.
+   * @param rightHash hash table built from join key columns from the right table
+   * @param outputRowCount number of output rows in the join result
+   * @return left and right table gather maps
+   */
+  public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        innerHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls(), outputRowCount);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
   /**
    * Computes the number of rows from the result of an inner join between two tables when a
    * conditional expression is true. It is assumed this table instance holds the columns from
@@ -2209,6 +2363,72 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual
     return buildJoinGatherMaps(gatherMapData);
   }
 
+  /**
+   * Computes the number of rows resulting from a full equi-join between two tables.
+   * It is assumed this table instance holds the key columns from the left table, and the
+   * {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Note that unlike {@link #leftJoinRowCount(HashJoin)} and {@link #innerJoinRowCount(HashJoin)},
+   * this will perform some redundant calculations compared to
+   * {@link #fullJoinGatherMaps(HashJoin, long)}.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return row count of the join result
+   */
+  public long fullJoinRowCount(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    return fullJoinRowCount(getNativeView(), rightHash.getNativeView(),
+        rightHash.getCompareNulls());
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a full equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the full join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightHash hash table built from join key columns from the right table
+   * @return left and right table gather maps
+   */
+  public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls());
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a full equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the {@link HashJoin} argument has been constructed from the key columns from the right table.
+   * Two {@link GatherMap} instances will be returned that can be used to gather the left and right
+   * tables, respectively, to produce the result of the full join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * This interface allows passing an output row count that was previously computed from
+   * {@link #fullJoinRowCount(HashJoin)}.
+   * WARNING: Passing a row count that is smaller than the actual row count will result
+   * in undefined behavior.
+   * @param rightHash hash table built from join key columns from the right table
+   * @param outputRowCount number of output rows in the join result
+   * @return left and right table gather maps
+   */
+  public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) {
+    if (getNumberOfColumns() != rightHash.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          " rightKeys: " + rightHash.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        fullHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(),
+            rightHash.getCompareNulls(), outputRowCount);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
   /**
    * Computes the gather maps that can be used to manifest the result of a full join between
    * two tables when a conditional expression is true. It is assumed this table instance holds
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 35ecae681b8..bc59e3aee64 100755
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -264,6 +264,7 @@ set(SOURCE_FILES
     "src/ColumnViewJni.cpp"
     "src/CompiledExpression.cpp"
     "src/ContiguousTableJni.cpp"
+    "src/HashJoinJni.cpp"
     "src/HostMemoryBufferNativeUtilsJni.cpp"
     "src/NvcompJni.cpp"
     "src/NvtxRangeJni.cpp"
diff --git a/java/src/main/native/src/HashJoinJni.cpp b/java/src/main/native/src/HashJoinJni.cpp
new file mode 100644
index 00000000000..0f78aef64bc
--- /dev/null
+++ b/java/src/main/native/src/HashJoinJni.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/join.hpp>
+
+#include "cudf_jni_apis.hpp"
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HashJoin_create(JNIEnv *env, jclass, jlong j_table,
+                                                            jboolean j_nulls_equal) {
+  JNI_NULL_CHECK(env, j_table, "table handle is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto tview = reinterpret_cast<cudf::table_view const *>(j_table);
+    auto nulleq = j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto hash_join_ptr = new cudf::hash_join(*tview, nulleq);
+    return reinterpret_cast<jlong>(hash_join_ptr);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_HashJoin_destroy(JNIEnv *env, jclass, jlong j_handle) {
+  try {
+    cudf::jni::auto_set_device(env);
+    auto hash_join_ptr = reinterpret_cast<cudf::hash_join *>(j_handle);
+    delete hash_join_ptr;
+  }
+  CATCH_STD(env, );
+}
+
+} // extern "C"
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 595bc1df151..f642a87b445 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -755,13 +755,46 @@ bool valid_window_parameters(native_jintArray const &values,
          values.size() == preceding.size() && values.size() == following.size();
 }
 
-// Generate gather maps needed to manifest the result of an equi-join between two tables.
+// Convert a cudf gather map pair into the form that Java expects
 // The resulting Java long array contains the following at each index:
 //   0: Size of each gather map in bytes
 //   1: Device address of the gather map for the left table
 //   2: Host address of the rmm::device_buffer instance that owns the left gather map data
 //   3: Device address of the gather map for the right table
 //   4: Host address of the rmm::device_buffer instance that owns the right gather map data
+jlongArray gather_maps_to_java(JNIEnv *env,
+                               std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
+                                         std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
+                                   maps) {
+  // release the underlying device buffer to Java
+  auto left_map_buffer = std::make_unique<rmm::device_buffer>(maps.first->release());
+  auto right_map_buffer = std::make_unique<rmm::device_buffer>(maps.second->release());
+  cudf::jni::native_jlongArray result(env, 5);
+  result[0] = static_cast<jlong>(left_map_buffer->size());
+  result[1] = reinterpret_cast<jlong>(left_map_buffer->data());
+  result[2] = reinterpret_cast<jlong>(left_map_buffer.release());
+  result[3] = reinterpret_cast<jlong>(right_map_buffer->data());
+  result[4] = reinterpret_cast<jlong>(right_map_buffer.release());
+  return result.get_jArray();
+}
+
+// Convert a cudf gather map into the form that Java expects
+// The resulting Java long array contains the following at each index:
+//   0: Size of the gather map in bytes
+//   1: Device address of the gather map
+//   2: Host address of the rmm::device_buffer instance that owns the gather map data
+jlongArray gather_map_to_java(JNIEnv *env,
+                              std::unique_ptr<rmm::device_uvector<cudf::size_type>> map) {
+  // release the underlying device buffer to Java
+  auto gather_map_buffer = std::make_unique<rmm::device_buffer>(map->release());
+  cudf::jni::native_jlongArray result(env, 3);
+  result[0] = static_cast<jlong>(gather_map_buffer->size());
+  result[1] = reinterpret_cast<jlong>(gather_map_buffer->data());
+  result[2] = reinterpret_cast<jlong>(gather_map_buffer.release());
+  return result.get_jArray();
+}
+
+// Generate gather maps needed to manifest the result of an equi-join between two tables.
 template <typename T>
 jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
                             jboolean compare_nulls_equal, T join_func) {
@@ -772,31 +805,29 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
     auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
     auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
-              std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-        join_maps = join_func(*left_keys, *right_keys, nulleq);
-
-    // release the underlying device buffer to Java
-    auto left_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.first->release());
-    auto right_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.second->release());
-    cudf::jni::native_jlongArray result(env, 5);
-    result[0] = static_cast<jlong>(left_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(left_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(left_map_buffer.release());
-    result[3] = reinterpret_cast<jlong>(right_map_buffer->data());
-    result[4] = reinterpret_cast<jlong>(right_map_buffer.release());
-    return result.get_jArray();
+    return gather_maps_to_java(env, join_func(*left_keys, *right_keys, nulleq));
+  }
+  CATCH_STD(env, NULL);
+}
+
+// Generate gather maps needed to manifest the result of an equi-join between a left table and
+// a hash table built from the join's right table.
+template <typename T>
+jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join,
+                                 jboolean compare_nulls_equal, T join_func) {
+  JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL);
+  JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    return gather_maps_to_java(env, join_func(*left_keys, *hash_join, nulleq));
   }
   CATCH_STD(env, NULL);
 }
 
 // Generate gather maps needed to manifest the result of a conditional join between two tables.
-// The resulting Java long array contains the following at each index:
-//   0: Size of each gather map in bytes
-//   1: Device address of the gather map for the left table
-//   2: Host address of the rmm::device_buffer instance that owns the left gather map data
-//   3: Device address of the gather map for the right table
-//   4: Host address of the rmm::device_buffer instance that owns the right gather map data
 template <typename T>
 jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_table,
                                  jlong j_condition, jboolean compare_nulls_equal, T join_func) {
@@ -809,29 +840,13 @@ jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_
     auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
     auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
-              std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-        join_maps = join_func(*left_table, *right_table, condition->get_top_expression(), nulleq);
-
-    // release the underlying device buffer to Java
-    auto left_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.first->release());
-    auto right_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.second->release());
-    cudf::jni::native_jlongArray result(env, 5);
-    result[0] = static_cast<jlong>(left_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(left_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(left_map_buffer.release());
-    result[3] = reinterpret_cast<jlong>(right_map_buffer->data());
-    result[4] = reinterpret_cast<jlong>(right_map_buffer.release());
-    return result.get_jArray();
+    return gather_maps_to_java(
+        env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq));
   }
   CATCH_STD(env, NULL);
 }
 
 // Generate a gather map needed to manifest the result of a semi/anti join between two tables.
-// The resulting Java long array contains the following at each index:
-//   0: Size of the gather map in bytes
-//   1: Device address of the gather map
-//   2: Host address of the rmm::device_buffer instance that owns the gather map data
 template <typename T>
 jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
                                   jboolean compare_nulls_equal, T join_func) {
@@ -842,26 +857,13 @@ jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_
     auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
     auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::unique_ptr<rmm::device_uvector<cudf::size_type>> join_map =
-        join_func(*left_keys, *right_keys, nulleq);
-
-    // release the underlying device buffer to Java
-    auto gather_map_buffer = std::make_unique<rmm::device_buffer>(join_map->release());
-    cudf::jni::native_jlongArray result(env, 3);
-    result[0] = static_cast<jlong>(gather_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(gather_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(gather_map_buffer.release());
-    return result.get_jArray();
+    return gather_map_to_java(env, join_func(*left_keys, *right_keys, nulleq));
   }
   CATCH_STD(env, NULL);
 }
 
 // Generate a gather map needed to manifest the result of a conditional semi/anti join
 // between two tables.
-// The resulting Java long array contains the following at each index:
-//   0: Size of the gather map in bytes
-//   1: Device address of the gather map
-//   2: Host address of the rmm::device_buffer instance that owns the gather map data
 template <typename T>
 jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_right_table,
                                        jlong j_condition, jboolean compare_nulls_equal,
@@ -875,16 +877,8 @@ jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_
     auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
     auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr *>(j_condition);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    std::unique_ptr<rmm::device_uvector<cudf::size_type>> join_map =
-        join_func(*left_table, *right_table, condition->get_top_expression(), nulleq);
-
-    // release the underlying device buffer to Java
-    auto gather_map_buffer = std::make_unique<rmm::device_buffer>(join_map->release());
-    cudf::jni::native_jlongArray result(env, 3);
-    result[0] = static_cast<jlong>(gather_map_buffer->size());
-    result[1] = reinterpret_cast<jlong>(gather_map_buffer->data());
-    result[2] = reinterpret_cast<jlong>(gather_map_buffer.release());
-    return result.get_jArray();
+    return gather_map_to_java(
+        env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq));
   }
   CATCH_STD(env, NULL);
 }
@@ -1951,6 +1945,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps(
       });
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass,
+                                                                   jlong j_left_table,
+                                                                   jlong j_right_hash_join,
+                                                                   jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
+  JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto row_count = hash_join->left_join_size(*left_table, nulleq);
+    return static_cast<jlong>(row_count);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join,
+    jboolean compare_nulls_equal) {
+  return cudf::jni::hash_join_gather_maps(
+      env, j_left_table, j_right_hash_join, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) {
+        return hash.left_join(left, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal,
+    jlong j_output_row_count) {
+  auto output_row_count = static_cast<std::size_t>(j_output_row_count);
+  return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal,
+                                          [output_row_count](cudf::table_view const &left,
+                                                             cudf::hash_join const &hash,
+                                                             cudf::null_equality nulleq) {
+                                            return hash.left_join(left, nulleq, output_row_count);
+                                          });
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount(
     JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
     jboolean compare_nulls_equal) {
@@ -2002,6 +2035,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps(
       });
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass,
+                                                                    jlong j_left_table,
+                                                                    jlong j_right_hash_join,
+                                                                    jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
+  JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto row_count = hash_join->inner_join_size(*left_table, nulleq);
+    return static_cast<jlong>(row_count);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join,
+    jboolean compare_nulls_equal) {
+  return cudf::jni::hash_join_gather_maps(
+      env, j_left_table, j_right_hash_join, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) {
+        return hash.inner_join(left, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal,
+    jlong j_output_row_count) {
+  auto output_row_count = static_cast<std::size_t>(j_output_row_count);
+  return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal,
+                                          [output_row_count](cudf::table_view const &left,
+                                                             cudf::hash_join const &hash,
+                                                             cudf::null_equality nulleq) {
+                                            return hash.inner_join(left, nulleq, output_row_count);
+                                          });
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount(
     JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
     jboolean compare_nulls_equal) {
@@ -2053,6 +2125,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps(
       });
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass,
+                                                                   jlong j_left_table,
+                                                                   jlong j_right_hash_join,
+                                                                   jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
+  JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
+    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto row_count = hash_join->full_join_size(*left_table, nulleq);
+    return static_cast<jlong>(row_count);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join,
+    jboolean compare_nulls_equal) {
+  return cudf::jni::hash_join_gather_maps(
+      env, j_left_table, j_right_hash_join, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) {
+        return hash.full_join(left, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount(
+    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal,
+    jlong j_output_row_count) {
+  auto output_row_count = static_cast<std::size_t>(j_output_row_count);
+  return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal,
+                                          [output_row_count](cudf::table_view const &left,
+                                                             cudf::hash_join const &hash,
+                                                             cudf::null_equality nulleq) {
+                                            return hash.full_join(left, nulleq, output_row_count);
+                                          });
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps(
     JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
     jboolean compare_nulls_equal) {
diff --git a/java/src/test/java/ai/rapids/cudf/HashJoinTest.java b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java
new file mode 100644
index 00000000000..be6125340ec
--- /dev/null
+++ b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class HashJoinTest {
+  @Test
+  void testGetNumberOfColumns() {
+    try (Table t = new Table.TestBuilder().column(1, 2).column(3, 4).column(5, 6).build();
+         HashJoin hashJoin = new HashJoin(t, false)) {
+      assertEquals(3, hashJoin.getNumberOfColumns());
+    }
+  }
+
+  @Test
+  void testGetCompareNulls() {
+    try (Table t = new Table.TestBuilder().column(1, 2, 3, 4).column(5, 6, 7, 8).build()) {
+      try (HashJoin hashJoin = new HashJoin(t, false)) {
+        assertFalse(hashJoin.getCompareNulls());
+      }
+      try (HashJoin hashJoin = new HashJoin(t, true)) {
+        assertTrue(hashJoin.getCompareNulls());
+      }
+    }
+  }
+}
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 8e4e3df612b..aeb94e4824a 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -30,10 +30,6 @@
 import ai.rapids.cudf.ast.ColumnReference;
 import ai.rapids.cudf.ast.CompiledExpression;
 import ai.rapids.cudf.ast.TableReference;
-import org.apache.arrow.memory.RootAllocator;
-import org.apache.arrow.vector.VectorSchemaRoot;
-import org.apache.arrow.vector.ipc.ArrowFileReader;
-import org.apache.arrow.vector.ipc.SeekableReadChannel;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.hadoop.ParquetFileReader;
@@ -1500,6 +1496,102 @@ void testLeftJoinGatherMapsNulls() {
     }
   }
 
+  @Test
+  void testLeftHashJoinGatherMaps() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(  0,   1, 2,   3,   4,   5,   6, 7, 8, 9)
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3)
+             .build()) {
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testLeftHashJoinGatherMapsWithCount() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(  0,   1, 2,   3,   4,   5,   6, 7, 8, 9)
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3)
+             .build()) {
+      long rowCount = leftKeys.leftJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testLeftHashJoinGatherMapsNulls() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(  0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testLeftHashJoinGatherMapsNullsWithCount() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true); // nulls compare as equal
+         Table expected = new Table.TestBuilder()
+             .column(  0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.leftJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
   @Test
   void testConditionalLeftJoinGatherMaps() {
     final int inv = Integer.MIN_VALUE;
@@ -1654,6 +1746,98 @@ void testInnerJoinGatherMapsNulls() {
     }
   }
 
+  @Test
+  void testInnerHashJoinGatherMaps() {
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 8, 9) // left
+             .column(2, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerHashJoinGatherMapsWithCount() {
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 8, 9) // left
+             .column(2, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.innerJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerHashJoinGatherMapsNulls() {
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 7, 8, 8, 9) // left
+             .column(2, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerHashJoinGatherMapsNullsWithCount() {
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 7, 8, 8, 9) // left
+             .column(2, 0, 1, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.innerJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
   @Test
   void testConditionalInnerJoinGatherMaps() {
     BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
@@ -1806,6 +1990,102 @@ void testFullJoinGatherMapsNulls() {
     }
   }
 
+  @Test
+  void testFullHashJoinGatherMaps() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullHashJoinGatherMapsWithCount() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build();
+         HashJoin rightHash = new HashJoin(rightKeys, false);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.fullJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullHashJoinGatherMapsNulls() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullHashJoinGatherMapsNullsWithCount() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         HashJoin rightHash = new HashJoin(rightKeys, true);
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      long rowCount = leftKeys.fullJoinRowCount(rightHash);
+      assertEquals(expected.getRowCount(), rowCount);
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
   @Test
   void testConditionalFullJoinGatherMaps() {
     final int inv = Integer.MIN_VALUE;

From ca58c1e35d5f10519f2083f10b8463db3f7a3cc4 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Fri, 20 Aug 2021 09:09:13 -0700
Subject: [PATCH 18/46] Remove stale code in `ColumnBase._fill` (#9078)

This PR removes unreachable code in `ColumnBase._fill`.
closes #8566

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/9078
---
 python/cudf/cudf/core/column/column.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b20c42926dc..d52f63a79f5 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -392,14 +392,6 @@ def _fill(
 
         return self
 
-        fill_code = self._encode(fill_value)
-        fill_scalar = as_device_scalar(fill_code, self.codes.dtype)
-
-        result = self if inplace else self.copy()
-
-        libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar)
-        return result
-
     def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase:
         return libcudf.copying.shift(self, offset, fill_value)
 

From 58692648043eb07d92e8462224c3eee9d8febb3d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 20 Aug 2021 10:27:42 -0700
Subject: [PATCH 19/46] Remove remaining "support" methods from DataFrame
 (#9068)

This PR rewrites DataFrame's `kurtosis` and `skew` to use the `_reduce` method introduced in #8944, and it inlines the logic for `count` to bypass the `_apply_support_method` machinery. This allows us to remove almost all of that machinery; only the code for row-wise reductions and scans, which dispatches to `cupy`, remains.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/9068
---
 python/cudf/cudf/core/dataframe.py | 140 ++++-------------------------
 python/cudf/cudf/core/frame.py     | 126 ++++++++++++++++++++++++++
 python/cudf/cudf/core/series.py    |  97 --------------------
 3 files changed, 143 insertions(+), 220 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f66cb570fbb..721ebf22de7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6305,12 +6305,9 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs):
         if axis != 0:
             raise NotImplementedError("Only axis=0 is currently supported.")
 
-        return self._apply_support_method(
-            "count",
-            axis=axis,
-            level=level,
-            numeric_only=numeric_only,
-            **kwargs,
+        return Series._from_data(
+            {None: [self._data[col].valid_count for col in self._data.names]},
+            as_index(self._data.names),
         )
 
     _SUPPORT_AXIS_LOOKUP = {
@@ -6343,7 +6340,7 @@ def _reduce(
                 {None: result}, as_index(self._data.names)
             )
         elif axis == 1:
-            return self._apply_support_method_axis_1(op, **kwargs)
+            return self._apply_cupy_method_axis_1(op, **kwargs)
 
     def _scan(
         self, op, axis=None, *args, **kwargs,
@@ -6353,7 +6350,7 @@ def _scan(
         if axis == 0:
             return super()._scan(op, axis=axis, *args, **kwargs)
         elif axis == 1:
-            return self._apply_support_method_axis_1(f"cum{op}", **kwargs)
+            return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs)
 
     def mode(self, axis=0, numeric_only=False, dropna=True):
         """
@@ -6458,100 +6455,17 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
     def kurtosis(
         self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
     ):
-        """
-        Return Fisher's unbiased kurtosis of a sample.
-
-        Kurtosis obtained using Fisher’s definition of
-        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
-
-        Parameters
-        ----------
-
-        skipna: bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        Series
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
-        >>> df.kurt()
-        a   -1.2
-        b   -1.2
-        dtype: float64
-        """
-        if axis not in (0, "index", None):
-            raise NotImplementedError("Only axis=0 is currently supported.")
-
-        if numeric_only not in (None, True):
-            msg = "Kurtosis only supports int, float, and bool dtypes."
-            raise NotImplementedError(msg)
-
-        filtered = self.select_dtypes(include=[np.number, np.bool_])
-        return filtered._apply_support_method(
-            "kurtosis",
-            axis=axis,
-            skipna=skipna,
-            level=level,
-            numeric_only=numeric_only,
-            **kwargs,
+        obj = self.select_dtypes(include=[np.number, np.bool_])
+        return super(DataFrame, obj).kurtosis(
+            axis, skipna, level, numeric_only, **kwargs
         )
 
-    # Alias for kurtosis.
-    kurt = kurtosis
-
     def skew(
         self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
     ):
-        """
-        Return unbiased Fisher-Pearson skew of a sample.
-
-        Parameters
-        ----------
-        skipna: bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        Series
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]})
-        >>> df.skew()
-        a    0.00000
-        b   -0.37037
-        dtype: float64
-        """
-        if axis not in (0, "index", None):
-            raise NotImplementedError("Only axis=0 is currently supported.")
-
-        if numeric_only not in (None, True):
-            msg = "Skew only supports int, float, and bool dtypes."
-            raise NotImplementedError(msg)
-
-        filtered = self.select_dtypes(include=[np.number, np.bool_])
-        return filtered._apply_support_method(
-            "skew",
-            axis=axis,
-            skipna=skipna,
-            level=level,
-            numeric_only=numeric_only,
-            **kwargs,
+        obj = self.select_dtypes(include=[np.number, np.bool_])
+        return super(DataFrame, obj).skew(
+            axis, skipna, level, numeric_only, **kwargs
         )
 
     def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
@@ -6562,23 +6476,11 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         obj = self.select_dtypes(include="bool") if bool_only else self
         return super(DataFrame, obj).any(axis, skipna, level, **kwargs)
 
-    def _apply_support_method_axis_0(self, method, *args, **kwargs):
-        result = [
-            getattr(self[col], method)(*args, **kwargs)
-            for col in self._data.names
-        ]
+    def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
+        # This method uses cupy to perform scans and reductions along rows of a
+        # DataFrame. Since cuDF is designed around columnar storage and
+        # operations, we convert DataFrames to 2D cupy arrays for these ops.
 
-        if isinstance(result[0], Series):
-            support_result = result
-            result = DataFrame(index=support_result[0].index)
-            for idx, col in enumerate(self._data.names):
-                result[col] = support_result[idx]
-        else:
-            result = Series(result)
-            result = result.set_index(self._data.names)
-        return result
-
-    def _apply_support_method_axis_1(self, method, *args, **kwargs):
         # for dask metadata compatibility
         skipna = kwargs.pop("skipna", None)
         skipna = True if skipna is None else skipna
@@ -6608,13 +6510,13 @@ def _apply_support_method_axis_1(self, method, *args, **kwargs):
         min_count = kwargs.pop("min_count", None)
         if min_count not in (None, 0):
             raise NotImplementedError(
-                "Row-wise operations currently do not " "support `min_count`."
+                "Row-wise operations currently do not support `min_count`."
             )
 
         bool_only = kwargs.pop("bool_only", None)
         if bool_only not in (None, True):
             raise NotImplementedError(
-                "Row-wise operations currently do not " "support `bool_only`."
+                "Row-wise operations currently do not support `bool_only`."
             )
 
         # This parameter is only necessary for axis 0 reductions that cuDF
@@ -6674,14 +6576,6 @@ def _apply_support_method_axis_1(self, method, *args, **kwargs):
             result_df.columns = prepared.columns
             return result_df
 
-    def _apply_support_method(self, method, axis=0, *args, **kwargs):
-        axis = self._get_axis_from_axis_arg(axis)
-
-        if axis == 0:
-            return self._apply_support_method_axis_0(method, *args, **kwargs)
-        elif axis == 1:
-            return self._apply_support_method_axis_1(method, *args, **kwargs)
-
     def _columns_view(self, columns):
         """
         Return a subset of the DataFrame's columns as a view.
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 5f1ac4e0c20..9f743cd8c85 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -27,6 +27,7 @@
 )
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.join import merge
+from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
     _is_non_decimal_numeric_dtype,
     _is_scalar_or_zero_d_array,
@@ -4056,6 +4057,131 @@ def var(
             **kwargs,
         )
 
+    def kurtosis(
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+    ):
+        """
+        Return Fisher's unbiased kurtosis of a sample.
+
+        Kurtosis obtained using Fisher’s definition of
+        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
+
+        Parameters
+        ----------
+
+        axis: {index (0), columns (1)}
+            Axis for the function to be applied on. Only 0 is supported.
+        skipna: bool, default True
+            Exclude NA/null values when computing the result.
+
+        Returns
+        -------
+        Series or scalar
+
+        Notes
+        -----
+        Parameters currently not supported are `level` and `numeric_only`
+
+        Examples
+        --------
+        **Series**
+
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4])
+        >>> series.kurtosis()
+        -1.1999999999999904
+
+        **DataFrame**
+
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
+        >>> df.kurt()
+        a   -1.2
+        b   -1.2
+        dtype: float64
+        """
+        if axis not in (0, "index", None):
+            raise NotImplementedError("Only axis=0 is currently supported.")
+
+        return self._reduce(
+            "kurtosis",
+            axis=axis,
+            skipna=skipna,
+            level=level,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    # Alias for kurtosis.
+    @copy_docstring(kurtosis)
+    def kurt(
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+    ):
+        return self.kurtosis(
+            axis=axis,
+            skipna=skipna,
+            level=level,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    def skew(
+        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+    ):
+        """
+        Return unbiased Fisher-Pearson skew of a sample.
+
+        Parameters
+        ----------
+        skipna: bool, default True
+            Exclude NA/null values when computing the result.
+
+        Returns
+        -------
+        Series or scalar
+
+        Notes
+        -----
+        Parameters currently not supported are `axis`, `level` and
+        `numeric_only`
+
+        Examples
+        --------
+        **Series**
+
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6])
+        >>> series
+        0    1
+        1    2
+        2    3
+        3    4
+        4    5
+        5    6
+        6    6
+        dtype: int64
+
+        **DataFrame**
+
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]})
+        >>> df.skew()
+        a    0.00000
+        b   -0.37037
+        dtype: float64
+        """
+        if axis not in (0, "index", None):
+            raise NotImplementedError("Only axis=0 is currently supported.")
+
+        return self._reduce(
+            "skew",
+            axis=axis,
+            skipna=skipna,
+            level=level,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
     def all(self, axis=0, skipna=True, level=None, **kwargs):
         """
         Return whether all elements are True in DataFrame.
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 380e1838534..ff3b9fc68ef 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4036,103 +4036,6 @@ def round(self, decimals=0, how="half_even"):
             dtype=self.dtype,
         )
 
-    def kurtosis(
-        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
-    ):
-        """
-        Return Fisher's unbiased kurtosis of a sample.
-
-        Kurtosis obtained using Fisher’s definition of
-        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
-
-        Parameters
-        ----------
-
-        skipna : bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        scalar
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> series = cudf.Series([1, 2, 3, 4])
-        >>> series.kurtosis()
-        -1.1999999999999904
-        """
-        if axis not in (None, 0):
-            raise NotImplementedError("axis parameter is not implemented yet")
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not implemented yet")
-
-        if numeric_only not in (None, True):
-            raise NotImplementedError(
-                "numeric_only parameter is not implemented yet"
-            )
-
-        return self._column.kurtosis(skipna=skipna)
-
-    # Alias for kurtosis.
-    kurt = kurtosis
-
-    def skew(
-        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
-    ):
-        """
-        Return unbiased Fisher-Pearson skew of a sample.
-
-        Parameters
-        ----------
-        skipna : bool, default True
-            Exclude NA/null values when computing the result.
-
-        Returns
-        -------
-        scalar
-
-        Notes
-        -----
-        Parameters currently not supported are `axis`, `level` and
-        `numeric_only`
-
-        Examples
-        --------
-        >>> import cudf
-        >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6])
-        >>> series
-        0    1
-        1    2
-        2    3
-        3    4
-        4    5
-        5    6
-        6    6
-        dtype: int64
-        >>> series.skew()
-        -0.288195490292614
-        """
-
-        if axis not in (None, 0):
-            raise NotImplementedError("axis parameter is not implemented yet")
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not implemented yet")
-
-        if numeric_only not in (None, True):
-            raise NotImplementedError(
-                "numeric_only parameter is not implemented yet"
-            )
-
-        return self._column.skew(skipna=skipna)
-
     def cov(self, other, min_periods=None):
         """
         Compute covariance with Series, excluding missing values.

From 3fec3d89bc6769a328ca226432e456f85a75609e Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Fri, 20 Aug 2021 15:15:51 -0500
Subject: [PATCH 20/46] Fetch rapids-cmake to work around cuCollection cmake
 issue (#9075)

Fixes #9073.

Adds explicit fetching of rapids-cmake v21.10 to work around build issues outside of the conda environment caused by https://github.com/NVIDIA/cuCollections/pull/104.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Alessandro Bellina (https://github.com/abellina)

URL: https://github.com/rapidsai/cudf/pull/9075
---
 cpp/CMakeLists.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6a972891958..3eee1147414 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -28,6 +28,17 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "")
   set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE)
 endif()
 
+file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake
+    ${CMAKE_BINARY_DIR}/RAPIDS.cmake)
+include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)
+
+include(rapids-cmake)
+include(rapids-cpm)
+include(rapids-cuda)
+include(rapids-export)
+include(rapids-find)
+
+
 project(CUDF VERSION 21.10.00 LANGUAGES C CXX)
 
 # Needed because GoogleBenchmark changes the state of FindThreads.cmake,

From 8c92812981a928d487bc485e7ab62bf075d45a12 Mon Sep 17 00:00:00 2001
From: MithunR <mythrocks@gmail.com>
Date: Fri, 20 Aug 2021 14:28:34 -0700
Subject: [PATCH 21/46] Support "unflatten" of columns flattened via
 `flatten_nested_columns()`: (#8956)

`cudf::flatten_nested_columns()` flattens out `STRUCT` columns into their
constituent member columns, and includes the `STRUCT`'s validity information
as a `BOOL8` column.
E.g. `STRUCT_1< STRUCT_2< A, B >, C >` is flattened to:
     1. Null Vector for `STRUCT_1`
     2. Null Vector for `STRUCT_2`
     3. Member `STRUCT_2::A`
     4. Member `STRUCT_2::B`
     5. Member `STRUCT_1::C`

This commit adds an `unflatten_nested_columns()` method to convert back
from a flattened representation to the nested columns.

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Devavret Makkar (https://github.com/devavret)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/8956
---
 cpp/src/structs/utilities.cpp         | 132 +++++++++++++++-
 cpp/src/structs/utilities.hpp         |  29 ++++
 cpp/tests/CMakeLists.txt              |   1 +
 cpp/tests/structs/utilities_tests.cpp | 220 ++++++++++++++++++++++++++
 4 files changed, 374 insertions(+), 8 deletions(-)
 create mode 100644 cpp/tests/structs/utilities_tests.cpp

diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp
index aa32c555324..bfeb6ef3533 100644
--- a/cpp/src/structs/utilities.cpp
+++ b/cpp/src/structs/utilities.cpp
@@ -16,8 +16,10 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
+#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/structs/structs_column_view.hpp>
+#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/error.hpp>
@@ -61,6 +63,24 @@ std::vector<std::vector<column_view>> extract_ordered_struct_children(
   return result;
 }
 
+namespace {
+/**
+ * @brief Check whether the specified column is of type `STRUCT`.
+ */
+bool is_struct(cudf::column_view const& col) { return col.type().id() == type_id::STRUCT; }
+
+/**
+ * @brief Check whether the specified column is of type LIST, or has any LISTs among its
+ * descendant columns.
+ */
+bool is_or_has_nested_lists(cudf::column_view const& col)
+{
+  auto is_list = [](cudf::column_view const& col) { return col.type().id() == type_id::LIST; };
+
+  return is_list(col) || std::any_of(col.child_begin(), col.child_end(), is_or_has_nested_lists);
+}
+}  // namespace
+
 /**
  * @brief Flattens struct columns to constituent non-struct columns in the input table.
  *
@@ -86,6 +106,13 @@ struct flattened_table {
       null_precedence(null_precedence),
       nullability(nullability)
   {
+    fail_if_unsupported_types(input);
+  }
+
+  void fail_if_unsupported_types(table_view const& input) const
+  {
+    auto const has_lists = std::any_of(input.begin(), input.end(), is_or_has_nested_lists);
+    CUDF_EXPECTS(not has_lists, "Flattening LIST columns is not supported.");
   }
 
   // Convert null_mask to BOOL8 columns and flatten the struct children in order.
@@ -156,9 +183,6 @@ struct flattened_table {
   }
 };
 
-/**
- * @copydoc cudf::detail::flatten_nested_columns
- */
 std::tuple<table_view,
            std::vector<order>,
            std::vector<null_order>,
@@ -168,15 +192,107 @@ flatten_nested_columns(table_view const& input,
                        std::vector<null_order> const& null_precedence,
                        column_nullability nullability)
 {
-  std::vector<std::unique_ptr<column>> validity_as_column;
-  auto const has_struct = std::any_of(
-    input.begin(), input.end(), [](auto const& col) { return col.type().id() == type_id::STRUCT; });
-  if (not has_struct)
-    return std::make_tuple(input, column_order, null_precedence, std::move(validity_as_column));
+  auto const has_struct = std::any_of(input.begin(), input.end(), is_struct);
+  if (not has_struct) {
+    return std::make_tuple(
+      input, column_order, null_precedence, std::vector<std::unique_ptr<column>>{});
+  }
 
   return flattened_table{input, column_order, null_precedence, nullability}();
 }
 
+namespace {
+using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
+using column_index_t    = typename vector_of_columns::size_type;
+
+// Forward declaration, to enable recursion via `unflattener`.
+std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
+                                               column_index_t& current_index,
+                                               cudf::column_view const& blueprint);
+
+/**
+ * @brief Helper functor to reconstruct STRUCT columns from their flattened member columns.
+ *
+ */
+class unflattener {
+ public:
+  unflattener(vector_of_columns& flattened_, column_index_t& current_index_)
+    : flattened{flattened_}, current_index{current_index_}
+  {
+  }
+
+  auto operator()(column_view const& blueprint)
+  {
+    return is_struct(blueprint) ? unflatten_struct(flattened, current_index, blueprint)
+                                : std::move(flattened[current_index++]);
+  }
+
+ private:
+  vector_of_columns& flattened;
+  column_index_t& current_index;
+
+};  // class unflattener;
+
+std::unique_ptr<cudf::column> unflatten_struct(vector_of_columns& flattened,
+                                               column_index_t& current_index,
+                                               cudf::column_view const& blueprint)
+{
+  // "Consume" columns from `flattened`, starting at `current_index`,
+  // based on the provided `blueprint` struct col. Recurse for struct children.
+  CUDF_EXPECTS(blueprint.type().id() == type_id::STRUCT,
+               "Expected blueprint column to be a STRUCT column.");
+
+  CUDF_EXPECTS(current_index < flattened.size(), "STRUCT column can't have 0 children.");
+
+  auto const num_rows = flattened[current_index]->size();
+
+  // cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
+  // before the child/member columns.
+  // E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
+  //      1. Null Vector for STRUCT_1
+  //      2. Null Vector for STRUCT_2
+  //      3. Member STRUCT_2::A
+  //      4. Member STRUCT_2::B
+  //      5. Member STRUCT_1::C
+  //
+  // Extract null-vector *before* child columns are constructed.
+  auto struct_null_column_contents = flattened[current_index++]->release();
+  auto unflattening_iter =
+    thrust::make_transform_iterator(blueprint.child_begin(), unflattener{flattened, current_index});
+
+  return cudf::make_structs_column(
+    num_rows,
+    vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_children()},
+    UNKNOWN_NULL_COUNT,  // Do count?
+    std::move(*struct_null_column_contents.null_mask));
+}
+}  // namespace
+
+std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
+                                                      table_view const& blueprint)
+{
+  // Bail, if LISTs are present.
+  auto const has_lists = std::any_of(blueprint.begin(), blueprint.end(), is_or_has_nested_lists);
+  CUDF_EXPECTS(not has_lists, "Unflattening LIST columns is not supported.");
+
+  // If there are no STRUCTs, unflattening is a NOOP.
+  auto const has_structs = std::any_of(blueprint.begin(), blueprint.end(), is_struct);
+  if (not has_structs) {
+    return std::move(flattened);  // Unchanged.
+  }
+
+  // There be struct columns.
+  // Note: Requires null vectors for all struct input columns.
+  auto flattened_columns = flattened->release();
+  auto current_idx       = column_index_t{0};
+
+  auto unflattening_iter =
+    thrust::make_transform_iterator(blueprint.begin(), unflattener{flattened_columns, current_idx});
+
+  return std::make_unique<cudf::table>(
+    vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_columns()});
+}
+
 // Helper function to superimpose validity of parent struct
 // over the specified member (child) column.
 void superimpose_parent_nulls(bitmask_type const* parent_null_mask,
diff --git a/cpp/src/structs/utilities.hpp b/cpp/src/structs/utilities.hpp
index eee9ca63146..a68f09574ce 100644
--- a/cpp/src/structs/utilities.hpp
+++ b/cpp/src/structs/utilities.hpp
@@ -76,6 +76,35 @@ flatten_nested_columns(table_view const& input,
                        std::vector<null_order> const& null_precedence,
                        column_nullability nullability = column_nullability::MATCH_INCOMING);
 
+/**
+ * @brief Unflatten columns that were flattened by `flatten_nested_columns()`,
+ *        based on the provided `blueprint`.
+ *
+ * cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector
+ * before the child/member columns.
+ * E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to:
+ *      1. Null Vector for STRUCT_1
+ *      2. Null Vector for STRUCT_2
+ *      3. Member STRUCT_2::A
+ *      4. Member STRUCT_2::B
+ *      5. Member STRUCT_1::C
+ *
+ * `unflatten_nested_columns()` reconstructs nested columns from flattened input that follows
+ * the convention above.
+ *
+ * Note: This function requires a null-mask vector for each STRUCT column, including for nested
+ * STRUCT members.
+ *
+ * @param flattened "Flattened" `table` of input columns, following the conventions in
+ * `flatten_nested_columns()`.
+ * @param blueprint The exemplar `table_view` with nested columns intact, whose structure defines
+ * the nesting of the reconstructed output table.
+ * @return std::unique_ptr<cudf::table> Unflattened table (with nested STRUCT columns) reconstructed
+ * based on `blueprint`.
+ */
+std::unique_ptr<cudf::table> unflatten_nested_columns(std::unique_ptr<cudf::table>&& flattened,
+                                                      table_view const& blueprint);
+
 /**
  * @brief Pushdown nulls from a parent mask into a child column, using AND.
  *
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index c82826b8c60..19421e3115d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -379,6 +379,7 @@ ConfigureTest(STRINGS_TEST
 # - structs test ----------------------------------------------------------------------------------
 ConfigureTest(STRUCTS_TEST 
     structs/structs_column_tests.cpp
+    structs/utilities_tests.cpp
     )
 
 ###################################################################################################
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp
new file mode 100644
index 00000000000..d4ded02adce
--- /dev/null
+++ b/cpp/tests/structs/utilities_tests.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <structs/utilities.hpp>
+
+namespace cudf::test {
+
+/**
+ * @brief Round-trip `input_table` through flatten_nested_columns()/unflatten_nested_columns()
+ *        (forcing null vectors for every STRUCT) and expect an equivalent table back.
+ */
+void flatten_unflatten_compare(table_view const& input_table)
+{
+  using namespace cudf::structs::detail;
+
+  auto [flattened, unused_order, unused_null_order, validity_columns] =
+    flatten_nested_columns(input_table, {}, {}, column_nullability::FORCE);
+  auto unflattened =
+    unflatten_nested_columns(std::make_unique<cudf::table>(flattened), input_table);
+
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, unflattened->view());
+}
+
+using namespace cudf;
+using iterators::null_at;
+using strings = strings_column_wrapper;
+using structs = structs_column_wrapper;
+
+struct StructUtilitiesTest : BaseFixture {
+};
+
+template <typename T>
+struct TypedStructUtilitiesTest : StructUtilitiesTest {
+};
+
+TYPED_TEST_CASE(TypedStructUtilitiesTest, FixedWidthTypes);
+
+TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevelUnsupported)
+{
+  using T     = TypeParam;
+  using lists = lists_column_wrapper<T, int32_t>;
+  using nums  = fixed_width_column_wrapper<T, int32_t>;
+
+  auto lists_col = lists{{0, 1}, {22, 33}, {44, 55, 66}};
+  auto nums_col  = nums{{0, 1, 2}, null_at(6)};
+
+  EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{lists_col, nums_col}}),
+               cudf::logic_error);
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, NestedListsUnsupported)
+{
+  using T     = TypeParam;
+  using lists = lists_column_wrapper<T, int32_t>;
+  using nums  = fixed_width_column_wrapper<T, int32_t>;
+
+  auto lists_member = lists{{0, 1}, {22, 33}, {44, 55, 66}};
+  auto nums_member  = nums{{0, 1, 2}, null_at(6)};
+  auto structs_col  = structs{{nums_member, lists_member}};
+
+  auto nums_col = nums{{0, 1, 2}, null_at(6)};
+
+  EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}),
+               cudf::logic_error);
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, NoStructs)
+{
+  using T    = TypeParam;
+  using nums = fixed_width_column_wrapper<T, int32_t>;
+
+  auto nums_col        = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)};
+  auto strings_col     = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
+  auto nuther_nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
+
+  flatten_unflatten_compare(cudf::table_view{{nums_col, strings_col, nuther_nums_col}});
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct)
+{
+  using T    = TypeParam;
+  using nums = fixed_width_column_wrapper<T, int32_t>;
+
+  auto nums_member    = nums{{0, 1, 22, 333, 44, 55, 66}, null_at(0)};
+  auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
+  auto structs_col    = structs{{nums_member, strings_member}};
+  auto nums_col       = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
+
+  flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}});
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls)
+{
+  using T    = TypeParam;
+  using nums = fixed_width_column_wrapper<T, int32_t>;
+
+  auto nums_member    = nums{{0, 1, 22, 333, 44, 55, 66}, null_at(0)};
+  auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
+  auto structs_col    = structs{{nums_member, strings_member}, null_at(2)};
+  auto nums_col       = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
+
+  flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}});
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct)
+{
+  using T    = TypeParam;
+  using nums = fixed_width_column_wrapper<T, int32_t>;
+
+  auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
+
+  auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)};
+  auto struct_0_strings_member =
+    strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
+  auto structs_1_structs_member = structs{{struct_0_nums_member, struct_0_strings_member}};
+
+  auto struct_1_nums_member  = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
+  auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}};
+
+  flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel)
+{
+  using T    = TypeParam;
+  using nums = fixed_width_column_wrapper<T, int32_t>;
+
+  auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
+
+  auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)};
+  auto struct_0_strings_member =
+    strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
+  auto structs_1_structs_member =
+    structs{{struct_0_nums_member, struct_0_strings_member}, null_at(2)};
+
+  auto struct_1_nums_member  = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
+  auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}};
+
+  flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel)
+{
+  using T    = TypeParam;
+  using nums = fixed_width_column_wrapper<T, int32_t>;
+
+  auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
+
+  auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)};
+  auto struct_0_strings_member =
+    strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
+  auto structs_1_structs_member = structs{{struct_0_nums_member, struct_0_strings_member}};
+
+  auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
+  auto struct_of_structs_col =
+    structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)};
+
+  flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels)
+{
+  using T    = TypeParam;
+  using nums = fixed_width_column_wrapper<T, int32_t>;
+
+  auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)};
+
+  auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)};
+  auto struct_0_strings_member =
+    strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)};
+  auto structs_1_structs_member =
+    structs{{struct_0_nums_member, struct_0_strings_member}, null_at(2)};
+
+  auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)};
+  auto struct_of_structs_col =
+    structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)};
+
+  flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}});
+}
+
+TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported)
+{
+  using T    = TypeParam;
+  using ints = fixed_width_column_wrapper<int32_t>;
+  using lcw  = lists_column_wrapper<T, int32_t>;
+
+  // clang-format off
+  auto lists_member = lcw{  {0,1,2}, {3,4,5}, {6,7,8,9} };
+  auto ints_member  = ints{       0,       1,         2 };
+  // clang-format on
+
+  auto structs_with_lists_col = structs{lists_member, ints_member};
+
+  EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{structs_with_lists_col}}),
+               cudf::logic_error);
+}
+
+}  // namespace cudf::test

From 6cd01678b04a32aa72605644fc7b79a1fff1a797 Mon Sep 17 00:00:00 2001
From: Raza Jafri <razajafri@users.noreply.github.com>
Date: Fri, 20 Aug 2021 15:54:43 -0700
Subject: [PATCH 22/46] Added method to remove null_masks if the column has no
 nulls (#9061)

This PR adds a method to remove the validity vector from columns in a Table that have no nulls but still carry a validity vector.

Authors:
  - Raza Jafri (https://github.com/razajafri)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/9061
---
 .../main/java/ai/rapids/cudf/ColumnView.java  |  56 +++++-
 java/src/main/java/ai/rapids/cudf/Table.java  |  11 +-
 java/src/main/native/src/TableJni.cpp         |  63 ++++++-
 .../test/java/ai/rapids/cudf/TableTest.java   | 170 +++++++++++++++---
 4 files changed, 269 insertions(+), 31 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 55bd5ec5ff9..4d9991d0dd9 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -101,11 +101,39 @@ public ColumnView(DType type, long rows, Optional<Long> nullCount,
         || !nullCount.isPresent();
   }
 
+  /**
+   * Create a new column view based off of data already on the device. Ref count on the buffers
+   * is not incremented and none of the underlying buffers are owned by this view. The returned
+   * ColumnView is only valid as long as the underlying buffers remain valid. If the buffers are
+   * closed before this ColumnView is closed, it will result in undefined behavior.
+   *
+   * If ownership is needed, call {@link ColumnView#copyToColumnVector}
+   *
+   * @param type           the type of the vector; must not be a nested type
+   * @param rows           the number of rows in this vector.
+   * @param nullCount      the number of nulls in the dataset, or empty if it is unknown.
+   * @param dataBuffer     the device buffer holding the column's data (nested types are not
+   *                       supported here). The ownership doesn't change on this buffer
+   * @param validityBuffer an optional validity buffer. Must be provided if nullCount != 0.
+   *                       The ownership doesn't change on this buffer
+   * @param offsetBuffer   The offsetbuffer for columns that need an offset buffer
+   */
+  public ColumnView(DType type, long rows, Optional<Long> nullCount,
+                    BaseDeviceMemoryBuffer dataBuffer,
+                    BaseDeviceMemoryBuffer validityBuffer, BaseDeviceMemoryBuffer offsetBuffer) {
+    this(type, (int) rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(),
+        dataBuffer, validityBuffer, offsetBuffer, null);
+    assert (!type.isNestedType());
+    assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE)
+        || !nullCount.isPresent();
+  }
+
   private ColumnView(DType type, long rows, int nullCount,
                      BaseDeviceMemoryBuffer dataBuffer, BaseDeviceMemoryBuffer validityBuffer,
                      BaseDeviceMemoryBuffer offsetBuffer, ColumnView[] children) {
     this(ColumnVector.initViewHandle(type, (int) rows, nullCount, dataBuffer, validityBuffer,
-        offsetBuffer, Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray()));
+        offsetBuffer, children == null ? new long[]{} :
+            Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray()));
   }
 
   /** Creates a ColumnVector from a column view handle
@@ -140,6 +168,32 @@ public final DType getType() {
     return type;
   }
 
+  /**
+   * Returns the child column views for this view.
+   * Please note that it is the responsibility of the caller to close these views.
+   * @return an array of child column views, or null if this view is not of a nested type
+   */
+  public final ColumnView[] getChildColumnViews() {
+    int numChildren = getNumChildren();
+    if (!getType().isNestedType()) {
+      return null;
+    }
+    ColumnView[] views = new ColumnView[numChildren];
+    try {
+      for (int i = 0; i < numChildren; i++) {
+        views[i] = getChildColumnView(i);
+      }
+      return views;
+    } catch(Throwable t) {
+      for (ColumnView v: views) {
+        if (v != null) {
+          v.close();
+        }
+      }
+      throw t;
+    }
+  }
+
   /**
    * Returns the child column view at a given index.
    * Please note that it is the responsibility of the caller to close this view.
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index e725932ed5e..eeb2d308f1a 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -170,10 +170,19 @@ public long getDeviceMemorySize() {
     return total;
   }
 
+  /**
+   * This method is internal and exposed purely for testing purposes
+   */
+  static Table removeNullMasksIfNeeded(Table table) {
+    return new Table(removeNullMasksIfNeeded(table.nativeHandle));
+  }
+
   /////////////////////////////////////////////////////////////////////////////
   // NATIVE APIs
   /////////////////////////////////////////////////////////////////////////////
-  
+
+  private static native long[] removeNullMasksIfNeeded(long tableView) throws CudfException;
+
   private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices);
 
   private static native long[] partition(long inputTable, long partitionView,
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index f642a87b445..2bb56565f7a 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -929,6 +929,45 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results,
   return combine_join_results(env, std::move(left_cols), std::move(right_cols));
 }
 
+// Returns a view of `column_view` in which every null mask that marks no rows as null
+// (nullable() with null_count() == 0) is dropped, recursing through all child columns.
+// Masks that do mark nulls are kept untouched; only views are rebuilt here.
+cudf::column_view remove_validity_from_col(cudf::column_view column_view) {
+  // The mask is redundant when it is allocated but every row is valid.
+  bool const has_redundant_mask = column_view.nullable() && column_view.null_count() == 0;
+  if (!cudf::is_compound(column_view.type())) {
+    if (!has_redundant_mask) {
+      return column_view;
+    }
+    // null_mask is allocated but no nulls present therefore we create a new column_view without
+    // the null_mask to avoid things blowing up in reading the parquet file
+    return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr,
+                             0, column_view.offset());
+  }
+  // Compound columns are rebuilt from their (recursively stripped) children.
+  std::vector<cudf::column_view> children;
+  children.reserve(column_view.num_children());
+  for (auto it = column_view.child_begin(); it != column_view.child_end(); ++it) {
+    children.push_back(remove_validity_from_col(*it));
+  }
+  // Returned by value: heap-allocating the view and returning `*ptr.release()` would copy it
+  // and leak the allocation on every call.
+  return cudf::column_view(column_view.type(), column_view.size(), nullptr,
+                           has_redundant_mask ? nullptr : column_view.null_mask(),
+                           has_redundant_mask ? 0 : column_view.null_count(),
+                           column_view.offset(), children);
+}
+
+// Builds a table_view over the same columns, each passed through remove_validity_from_col.
+cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) {
+  std::vector<cudf::column_view> stripped_cols;
+  stripped_cols.reserve(input_table_view->num_columns());
+  for (cudf::column_view const &col : *input_table_view) {
+    stripped_cols.push_back(remove_validity_from_col(col));
+  }
+  return cudf::table_view(stripped_cols);
+}
+
 } // namespace
 
 } // namespace jni
@@ -936,6 +975,25 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results,
 
 extern "C" {
 
+// Test-only entry point for remove_validity_if_needed; returns handles to copies of the columns
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(JNIEnv *env, jclass,
+                                                                               jlong j_table_view) {
+  JNI_NULL_CHECK(env, j_table_view, "table view handle is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *tview = reinterpret_cast<cudf::table_view *>(j_table_view);
+    cudf::table_view result = cudf::jni::remove_validity_if_needed(tview);
+    // Copy the stripped views into owned columns so the returned handles own their data
+    std::vector<std::unique_ptr<cudf::column>> cols = cudf::table(result).release();
+    auto results = cudf::jni::native_jlongArray(env, cols.size());
+    for (int i = 0; i < static_cast<int>(cols.size()); ++i) {
+      results[i] = reinterpret_cast<jlong>(cols[i].release());
+    }
+    return results.get_jArray();
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *env, jclass,
                                                                       jlongArray j_cudf_columns) {
   JNI_NULL_CHECK(env, j_cudf_columns, "columns are null", 0);
@@ -1357,7 +1415,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env,
   JNI_NULL_CHECK(env, j_state, "null state", );
 
   using namespace cudf::io;
-  cudf::table_view *tview = reinterpret_cast<cudf::table_view *>(j_table);
+  cudf::table_view *tview_with_empty_nullmask = reinterpret_cast<cudf::table_view *>(j_table);
+  cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_with_empty_nullmask);
   cudf::jni::native_parquet_writer_handle *state =
       reinterpret_cast<cudf::jni::native_parquet_writer_handle *>(j_state);
 
@@ -1367,7 +1426,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env,
   }
   try {
     cudf::jni::auto_set_device(env);
-    state->writer->write(*tview);
+    state->writer->write(tview);
   }
   CATCH_STD(env, )
 }
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index aeb94e4824a..cc030c392cb 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -49,19 +49,14 @@
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.stream.Collectors;
 
 import static ai.rapids.cudf.ParquetColumnWriterOptions.mapColumn;
 import static ai.rapids.cudf.ParquetWriterOptions.listBuilder;
 import static ai.rapids.cudf.ParquetWriterOptions.structBuilder;
 import static ai.rapids.cudf.Table.TestBuilder;
+import static ai.rapids.cudf.Table.removeNullMasksIfNeeded;
 import static org.junit.jupiter.api.Assertions.assertArrayEquals;
 import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -111,7 +106,7 @@ public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) {
    * @param colName The name of the column
    */
   public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) {
-    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true);
+    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false);
   }
 
   /**
@@ -121,7 +116,7 @@ public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, Str
    * @param colName The name of the host column
    */
   public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) {
-    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true);
+    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false);
   }
 
   /**
@@ -130,7 +125,7 @@ public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVe
    * @param cv The input Struct column
    */
   public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) {
-    assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true);
+    assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false);
   }
 
   /**
@@ -140,13 +135,14 @@ public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView c
    * @param length The number of rows to consider
    * @param cv The input Struct column
    * @param colName The name of the column
-   * @param enableNullCheck Whether to check for nulls in the Struct column
+   * @param enableNullCountCheck Whether to check for nulls in the Struct column
+   * @param enableNullabilityCheck Whether to check that both columns agree on having a validity mask
    */
   public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length,
-      ColumnView cv, String colName, boolean enableNullCheck) {
+      ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) {
     try (HostColumnVector hostExpected = expected.copyToHost();
          HostColumnVector hostcv = cv.copyToHost()) {
-      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck);
+      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck);
     }
   }
 
@@ -156,12 +152,13 @@ public static void assertPartialStructColumnsAreEqual(ColumnView expected, long
    * @param cv The input column
    * @param colName The name of the column
    * @param enableNullCheck Whether to check for nulls in the column
+   * @param enableNullabilityCheck Whether to check that both columns agree on having a validity mask
    */
   public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length,
-      ColumnView cv, String colName, boolean enableNullCheck) {
+      ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) {
     try (HostColumnVector hostExpected = expected.copyToHost();
          HostColumnVector hostcv = cv.copyToHost()) {
-      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck);
+      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck);
     }
   }
 
@@ -172,18 +169,21 @@ public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOff
    * @param length  number of rows from starting offset
    * @param cv The input host column
    * @param colName The name of the host column
-   * @param enableNullCheck Whether to check for nulls in the host column
+   * @param enableNullCountCheck Whether to check for nulls in the host column
    */
   public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length,
-                                                  HostColumnVectorCore cv, String colName, boolean enableNullCheck) {
+                                                  HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) {
     assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName);
     assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName);
     assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName);
-    if (enableNullCheck) {
+    if (enableNullCountCheck) {
       assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName);
     } else {
       // TODO add in a proper check when null counts are supported by serializing a partitioned column
     }
+    if (enableNullabilityCheck) {
+      assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected");
+    }
     DType type = expected.getType();
     for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) {
       long tableRow = expectedRow - rowOffset;
@@ -269,7 +269,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l
           }
           assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset,
               numChildRows, cv.getNestedChildren().get(0), colName + " list child",
-              enableNullCheck);
+              enableNullCountCheck, enableNullabilityCheck);
           break;
         case STRUCT:
           List<HostColumnVectorCore> expectedChildren = expected.getNestedChildren();
@@ -280,7 +280,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l
             String childName = colName + " child " + i;
             assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName);
             assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild,
-                colName, enableNullCheck);
+                colName, enableNullCountCheck, enableNullabilityCheck);
           }
           break;
         default:
@@ -296,9 +296,10 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l
    * @param length the number of rows to check
    * @param table the input table to compare against expected
    * @param enableNullCheck whether to check for nulls or not
+   * @param enableNullabilityCheck whether to check that the columns of both tables agree on having a validity mask
    */
   public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table,
-                                                 boolean enableNullCheck) {
+                                                 boolean enableNullCheck, boolean enableNullabilityCheck) {
     assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns());
     assertEquals(length, table.getRowCount(), "ROW COUNT");
     for (int col = 0; col < expected.getNumberOfColumns(); col++) {
@@ -308,7 +309,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l
       if (rowOffset != 0 || length != expected.getRowCount()) {
         name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1);
       }
-      assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck);
+      assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck);
     }
   }
 
@@ -318,7 +319,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l
    * @param table the input table to compare against expected
    */
   public static void assertTablesAreEqual(Table expected, Table table) {
-    assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true);
+    assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false);
   }
 
   void assertTablesHaveSameValues(HashMap<Object, Integer>[] expectedTable, Table table) {
@@ -3235,7 +3236,7 @@ void testSerializationRoundTripConcatHostSide() throws IOException {
           try (Table found = JCudfSerialization.readAndConcat(
               headers.toArray(new JCudfSerialization.SerializedTableHeader[headers.size()]),
               buffers.toArray(new HostMemoryBuffer[buffers.size()]))) {
-            assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false);
+            assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false, false);
           }
         } finally {
           for (HostMemoryBuffer buff: buffers) {
@@ -3288,7 +3289,7 @@ void testConcatHost() throws IOException {
         try (Table result = JCudfSerialization.readAndConcat(
             new JCudfSerialization.SerializedTableHeader[] {header, header},
             new HostMemoryBuffer[] {buff, buff})) {
-          assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false);
+          assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false, false);
         }
       }
     }
@@ -3329,7 +3330,7 @@ void testSerializationRoundTripSlicedHostSide() throws IOException {
               buffers.toArray(new HostMemoryBuffer[buffers.size()]), bout2);
           ByteArrayInputStream bin2 = new ByteArrayInputStream(bout2.toByteArray());
           try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin2)) {
-            assertPartialTablesAreEqual(t, 0, t.getRowCount(), found.getTable(), false);
+            assertPartialTablesAreEqual(t, 0, t.getRowCount(), found.getTable(), false, false);
             assertEquals(found.getTable(), found.getContiguousTable().getTable());
             assertNotNull(found.getContiguousTable().getBuffer());
           }
@@ -3355,7 +3356,7 @@ void testSerializationRoundTripSliced() throws IOException {
           JCudfSerialization.writeToStream(t, bout, i, len);
           ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray());
           try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin)) {
-            assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount());
+            assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount(), false);
             assertEquals(found.getTable(), found.getContiguousTable().getTable());
             assertNotNull(found.getContiguousTable().getBuffer());
           }
@@ -6360,6 +6361,121 @@ void testAllFilteredFromValidity() {
     }
   }
 
+  /**
+   * Builds a view like cv that uses the given validity buffer and reports nullCount nulls.
+   */
+  ColumnView replaceValidity(ColumnView cv, DeviceMemoryBuffer validity, long nullCount) {
+    assert (validity.length >= BitVectorHelper.getValidityAllocationSizeInBytes(cv.rows));
+    Optional<Long> nulls = Optional.of(nullCount);
+    if (!cv.type.isNestedType()) {
+      return new ColumnView(cv.type, cv.rows, nulls, cv.getData(), validity, cv.getOffsets());
+    }
+    // Nested columns are rebuilt from their child views rather than from a data buffer
+    ColumnView[] childViews = cv.getChildColumnViews();
+    try {
+      return new ColumnView(cv.type, cv.rows, nulls, validity, cv.getOffsets(), childViews);
+    } finally {
+      // The child views are closed whether or not the new view could be built
+      for (ColumnView child : childViews) {
+        if (child != null) {
+          child.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testRemoveNullMasksIfNeeded() {
+    ListType nestedType = new ListType(true, new StructType(false,
+        new BasicType(true, DType.INT32),
+        new BasicType(true, DType.INT64)));
+
+    List data1 = Arrays.asList(10, 20L);
+    List data2 = Arrays.asList(50, 60L);
+    HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1);
+    HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2);
+
+    //First we create ColumnVectors
+    try (ColumnVector nonNullVector0 = ColumnVector.fromBoxedInts(1, 2, 3);
+         ColumnVector nonNullVector2 = ColumnVector.fromStrings("1", "2", "3");
+         ColumnVector nonNullVector1 = ColumnVector.fromLists(nestedType,
+             Arrays.asList(structData1, structData2),
+             Arrays.asList(structData1, structData2),
+             Arrays.asList(structData1, structData2))) {
+      //Then we take the created ColumnVectors and add validity masks even though the nullCount = 0
+      long allocSize = BitVectorHelper.getValidityAllocationSizeInBytes(nonNullVector0.rows);
+      try (DeviceMemoryBuffer dm0 = DeviceMemoryBuffer.allocate(allocSize);
+           DeviceMemoryBuffer dm1 = DeviceMemoryBuffer.allocate(allocSize);
+           DeviceMemoryBuffer dm2 = DeviceMemoryBuffer.allocate(allocSize);
+           DeviceMemoryBuffer dm3_child =
+               DeviceMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(2))) {
+        Cuda.memset(dm0.address, (byte) 0xFF, allocSize);
+        Cuda.memset(dm1.address, (byte) 0xFF, allocSize);
+        Cuda.memset(dm2.address, (byte) 0xFF, allocSize);
+        Cuda.memset(dm3_child.address, (byte) 0xFF,
+            BitVectorHelper.getValidityAllocationSizeInBytes(2));
+
+        try (ColumnView cv0View = replaceValidity(nonNullVector0, dm0, 0);
+             ColumnVector cv0 = cv0View.copyToColumnVector();
+             ColumnView struct = nonNullVector1.getChildColumnView(0);
+             ColumnView structChild0 = struct.getChildColumnView(0);
+             ColumnView newStructChild0 = replaceValidity(structChild0, dm3_child, 0);
+             ColumnView newStruct = struct.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStructChild0});
+             ColumnView list = nonNullVector1.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStruct});
+             ColumnView cv1View = replaceValidity(list, dm1, 0);
+             ColumnVector cv1 = cv1View.copyToColumnVector();
+             ColumnView cv2View = replaceValidity(nonNullVector2, dm2, 0);
+             ColumnVector cv2 = cv2View.copyToColumnVector()) {
+
+          try (Table t = new Table(new ColumnVector[]{cv0, cv1, cv2});
+               Table tableWithoutNullMask = removeNullMasksIfNeeded(t);
+               // Each child view must be closed by the caller, so the intermediate struct view
+               // gets its own resource instead of being dropped inside a chained call
+               ColumnView tableStruct = t.getColumn(1).getChildColumnView(0);
+               ColumnView tableStructChild0 = tableStruct.getChildColumnView(0);
+               ColumnVector tableStructChild0Cv = tableStructChild0.copyToColumnVector();
+               Table expected = new Table(new ColumnVector[]{nonNullVector0, nonNullVector1, nonNullVector2})) {
+            assertTrue(t.getColumn(0).hasValidityVector());
+            assertTrue(t.getColumn(1).hasValidityVector());
+            assertTrue(t.getColumn(2).hasValidityVector());
+            assertTrue(tableStructChild0Cv.hasValidityVector());
+
+            // Values must be unchanged; the final `true` also requires each result column to
+            // agree with `expected` on whether a validity mask is present
+            assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), tableWithoutNullMask,
+                true, true);
+          }
+        }
+      }
+    }
+  }
+
+  @Test
+  void testRemoveNullMasksIfNeededWithNulls() {
+    List data1 = Arrays.asList(0, 10L);
+    List data2 = Arrays.asList(50, null);
+    HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1);
+    HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2);
+
+    BasicType intChild = new BasicType(true, DType.INT32);
+    BasicType longChild = new BasicType(true, DType.INT64);
+    ListType listOfStructs = new ListType(true, new StructType(true, intChild, longChild));
+
+    // Each top-level column below is built with at least one null entry
+    try (ColumnVector intsWithNull = ColumnVector.fromBoxedInts(1, null, 2, 3);
+         ColumnVector stringsWithNull = ColumnVector.fromStrings("1", "2", null, "3");
+         ColumnVector listsWithNull = ColumnVector.fromLists(listOfStructs,
+             Arrays.asList(structData1, structData2),
+             null,
+             Arrays.asList(structData1, structData2),
+             Arrays.asList(structData1, structData2));
+         Table expected = new Table(new ColumnVector[]{intsWithNull, stringsWithNull, listsWithNull});
+         Table unchangedTable = removeNullMasksIfNeeded(expected)) {
+      // The call is expected to leave the table's contents as they were
+      assertTablesAreEqual(expected, unchangedTable);
+    }
+  }
+
   @Test
   void testMismatchedSizesForFilter() {
     Boolean[] maskVals = new Boolean[3];

From e42464ce44b1728b69b0df2f104b6df924052041 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 23 Aug 2021 09:59:36 -0400
Subject: [PATCH 23/46] Fix memcheck read error in libcudf contiguous_split
 (#9067)

Reference #8883

`cudf::contiguous_split` was failing under `compute-sanitizer` memcheck with a 4-byte out-of-bounds read. This was traced to the `copy_buffer` device function, which was reading one past the end of the input buffer when performing a value-shift. The ternary check meant to guard that read used the wrong condition, so it did not prevent the out-of-bounds access. This PR corrects the logic.

Also, I fixed some `const` removal casts from the same source file by adding appropriate `const` qualifiers to the input data variables.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/9067
---
 cpp/src/copying/contiguous_split.cu | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 779a6a74f1d..a9194ceea93 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -132,7 +132,7 @@ struct dst_buf_info {
  */
 template <int block_size>
 __device__ void copy_buffer(uint8_t* __restrict__ dst,
-                            uint8_t* __restrict__ src,
+                            uint8_t const* __restrict__ src,
                             int t,
                             std::size_t num_elements,
                             std::size_t element_size,
@@ -193,11 +193,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
     // and will never both be true at the same time.
     if (value_shift || bit_shift) {
       std::size_t idx = (num_bytes - remainder) / 4;
-      uint32_t v      = remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx] - value_shift) : 0;
+      uint32_t v = remainder > 0 ? (reinterpret_cast<uint32_t const*>(src)[idx] - value_shift) : 0;
       while (remainder) {
-        uint32_t const next =
-          remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx + 1] - value_shift) : 0;
-        uint32_t const val = (v >> bit_shift) | (next << (32 - bit_shift));
+        uint32_t const next = bit_shift > 0 || remainder > 4
+                                ? (reinterpret_cast<uint32_t const*>(src)[idx + 1] - value_shift)
+                                : 0;
+        uint32_t const val  = (v >> bit_shift) | (next << (32 - bit_shift));
         if (valid_count) { thread_valid_count += __popc(val); }
         reinterpret_cast<uint32_t*>(dst)[idx] = val;
         v                                     = next;
@@ -207,7 +208,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
     } else {
       while (remainder) {
         std::size_t const idx = num_bytes - remainder--;
-        uint32_t const val    = reinterpret_cast<uint8_t*>(src)[idx];
+        uint32_t const val    = reinterpret_cast<uint8_t const*>(src)[idx];
         if (valid_count) { thread_valid_count += __popc(val); }
         reinterpret_cast<uint8_t*>(dst)[idx] = val;
       }
@@ -255,7 +256,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
  */
 template <int block_size>
 __global__ void copy_partition(int num_src_bufs,
-                               uint8_t** src_bufs,
+                               uint8_t const** src_bufs,
                                uint8_t** dst_bufs,
                                dst_buf_info* buf_info)
 {
@@ -349,13 +350,13 @@ OutputIter setup_src_buf_data(InputIter begin, InputIter end, OutputIter out_buf
 {
   std::for_each(begin, end, [&out_buf](column_view const& col) {
     if (col.nullable()) {
-      *out_buf = reinterpret_cast<uint8_t*>(const_cast<bitmask_type*>(col.null_mask()));
+      *out_buf = reinterpret_cast<uint8_t const*>(col.null_mask());
       out_buf++;
     }
     // NOTE: we're always returning the base pointer here.  column-level offset is accounted
     // for later. Also, for some column types (string, list, struct) this pointer will be null
     // because there is no associated data with the root column.
-    *out_buf = const_cast<uint8_t*>(col.head<uint8_t>());
+    *out_buf = col.head<uint8_t>();
     out_buf++;
 
     out_buf = setup_src_buf_data(col.child_begin(), col.child_end(), out_buf);
@@ -1020,14 +1021,14 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
     cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align);
   // host-side
   std::vector<uint8_t> h_src_and_dst_buffers(src_bufs_size + dst_bufs_size);
-  uint8_t** h_src_bufs = reinterpret_cast<uint8_t**>(h_src_and_dst_buffers.data());
+  uint8_t const** h_src_bufs = reinterpret_cast<uint8_t const**>(h_src_and_dst_buffers.data());
   uint8_t** h_dst_bufs = reinterpret_cast<uint8_t**>(h_src_and_dst_buffers.data() + src_bufs_size);
   // device-side
   rmm::device_buffer d_src_and_dst_buffers(src_bufs_size + dst_bufs_size + offset_stack_size,
                                            stream,
                                            rmm::mr::get_current_device_resource());
-  uint8_t** d_src_bufs = reinterpret_cast<uint8_t**>(d_src_and_dst_buffers.data());
-  uint8_t** d_dst_bufs = reinterpret_cast<uint8_t**>(
+  uint8_t const** d_src_bufs = reinterpret_cast<uint8_t const**>(d_src_and_dst_buffers.data());
+  uint8_t** d_dst_bufs       = reinterpret_cast<uint8_t**>(
     reinterpret_cast<uint8_t*>(d_src_and_dst_buffers.data()) + src_bufs_size);
 
   // setup src buffers

From d4c3f32af00388dd41b78428486e4a2f53257384 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 23 Aug 2021 10:09:00 -0400
Subject: [PATCH 24/46] Fix memory write error in
 get_list_child_to_list_row_mapping utility (#8994)

Reference issue #8883 and depends on fixes in PR #8884

The `get_list_child_to_list_row_mapping` utility builds a map for rolling operations on a lists column. In the `thrust::scatter` call, one map value is the last offset, which is always out-of-bounds for the given output vector. This output vector is used to build the resultant output map by calling `thrust::inclusive_scan`, but the out-of-bounds offset value is not used -- which is why the utility does not fail. The fix in this PR simply allocates an extra row in the intermediate vector so that `thrust::scatter` will not write to out-of-bounds memory. Since the value is eventually ignored, it does not affect the result.

The code in this function was incorrectly creating many temporary columns using the passed-in `device_resource_manager` variable `mr`. This was corrected by changing them to plain `device_uvector`s, making it clearer that they are internal temporary memory buffers. Further, the code calling the `get_list_child_to_list_row_mapping` utility uses the output as a temporary column, so this PR also fixes that logic to use the correct memory resource.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - MithunR (https://github.com/mythrocks)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/8994
---
 cpp/CMakeLists.txt                       |   3 +-
 cpp/src/rolling/rolling_collect_list.cu  | 157 ++++++++++++++++++++++
 cpp/src/rolling/rolling_collect_list.cuh | 163 +++--------------------
 3 files changed, 176 insertions(+), 147 deletions(-)
 create mode 100644 cpp/src/rolling/rolling_collect_list.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3eee1147414..d6b457a94d4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -368,8 +368,9 @@ add_library(cudf
     src/reshape/interleave_columns.cu
     src/reshape/tile.cu
     src/rolling/grouped_rolling.cu
-    src/rolling/rolling.cu
     src/rolling/range_window_bounds.cpp
+    src/rolling/rolling.cu
+    src/rolling/rolling_collect_list.cu
     src/round/round.cu
     src/scalar/scalar.cpp
     src/scalar/scalar_factories.cpp
diff --git a/cpp/src/rolling/rolling_collect_list.cu b/cpp/src/rolling/rolling_collect_list.cu
new file mode 100644
index 00000000000..ecef90dc8e1
--- /dev/null
+++ b/cpp/src/rolling/rolling_collect_list.cu
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rolling/rolling_collect_list.cuh>
+
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/iterator.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @see cudf::detail::get_list_child_to_list_row_mapping
+ */
+std::unique_ptr<column> get_list_child_to_list_row_mapping(cudf::column_view const& offsets,
+                                                           rmm::cuda_stream_view stream)
+{
+  // First, scatter the row index of each non-empty list row to the position of
+  // its first child, in a column of N `0`s, where N == number of child rows.
+  // For example:
+  //   offsets        == [0, 2, 5, 8, 11, 13]
+  //   scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0]
+  //
+  // An example with empty list row at index 2:
+  //   offsets        == [0, 2, 5, 5, 8, 11, 13]
+  //   scatter result == [0, 0, 1, 0, 0, 3, 0, 0, 4, 0, 0, 5, 0]
+  //
+  auto const num_child_rows{
+    cudf::detail::get_value<size_type>(offsets, offsets.size() - 1, stream)};
+  auto per_row_mapping = make_fixed_width_column(
+    data_type{type_to_id<size_type>()}, num_child_rows, mask_state::UNALLOCATED, stream);
+  auto per_row_mapping_begin = per_row_mapping->mutable_view().template begin<size_type>();
+  thrust::fill_n(rmm::exec_policy(stream), per_row_mapping_begin, num_child_rows, 0);
+
+  auto const begin = thrust::make_counting_iterator<size_type>(0);
+  thrust::scatter_if(rmm::exec_policy(stream),
+                     begin,
+                     begin + offsets.size() - 1,
+                     offsets.begin<size_type>(),
+                     begin,  // stencil iterator
+                     per_row_mapping_begin,
+                     [offset = offsets.begin<size_type>()] __device__(auto i) {
+                       return offset[i] != offset[i + 1];
+                     });  // [0,0,1,0,0,3,...]
+
+  // Next, generate mapping with inclusive_scan(max) on the scatter result.
+  // For the example above:
+  //   scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0]
+  //   inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4]
+  //
+  // For the case with an empty list at index 2:
+  //   scatter result == [0, 0, 1, 0, 0, 3, 0, 0, 4, 0, 0, 5, 0]
+  //   inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5]
+  thrust::inclusive_scan(rmm::exec_policy(stream),
+                         per_row_mapping_begin,
+                         per_row_mapping_begin + num_child_rows,
+                         per_row_mapping_begin,
+                         thrust::maximum<size_type>{});
+  return per_row_mapping;
+}
+
+/**
+ * @see cudf::detail::count_child_nulls
+ */
+size_type count_child_nulls(column_view const& input,
+                            std::unique_ptr<column> const& gather_map,
+                            rmm::cuda_stream_view stream)
+{
+  auto input_device_view = column_device_view::create(input, stream);
+
+  auto input_row_is_null = [d_input = *input_device_view] __device__(auto i) {
+    return d_input.is_null_nocheck(i);
+  };
+
+  return thrust::count_if(rmm::exec_policy(stream),
+                          gather_map->view().begin<size_type>(),
+                          gather_map->view().end<size_type>(),
+                          input_row_is_null);
+}
+
+/**
+ * @see cudf::detail::purge_null_entries
+ */
+std::pair<std::unique_ptr<column>, std::unique_ptr<column>> purge_null_entries(
+  column_view const& input,
+  column_view const& gather_map,
+  column_view const& offsets,
+  size_type num_child_nulls,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  auto input_device_view = column_device_view::create(input, stream);
+
+  auto input_row_not_null = [d_input = *input_device_view] __device__(auto i) {
+    return d_input.is_valid_nocheck(i);
+  };
+
+  // Purge entries in gather_map that correspond to null input.
+  auto new_gather_map = make_fixed_width_column(data_type{type_to_id<size_type>()},
+                                                gather_map.size() - num_child_nulls,
+                                                mask_state::UNALLOCATED,
+                                                stream);
+  thrust::copy_if(rmm::exec_policy(stream),
+                  gather_map.template begin<size_type>(),
+                  gather_map.template end<size_type>(),
+                  new_gather_map->mutable_view().template begin<size_type>(),
+                  input_row_not_null);
+
+  // Recalculate offsets after null entries are purged.
+  auto new_sizes = make_fixed_width_column(
+    data_type{type_to_id<size_type>()}, input.size(), mask_state::UNALLOCATED, stream);
+
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator<size_type>(0),
+                    thrust::make_counting_iterator<size_type>(input.size()),
+                    new_sizes->mutable_view().template begin<size_type>(),
+                    [d_gather_map  = gather_map.template begin<size_type>(),
+                     d_old_offsets = offsets.template begin<size_type>(),
+                     input_row_not_null] __device__(auto i) {
+                      return thrust::count_if(thrust::seq,
+                                              d_gather_map + d_old_offsets[i],
+                                              d_gather_map + d_old_offsets[i + 1],
+                                              input_row_not_null);
+                    });
+
+  auto new_offsets =
+    strings::detail::make_offsets_child_column(new_sizes->view().template begin<size_type>(),
+                                               new_sizes->view().template end<size_type>(),
+                                               stream,
+                                               mr);
+
+  return std::make_pair<std::unique_ptr<column>, std::unique_ptr<column>>(std::move(new_gather_map),
+                                                                          std::move(new_offsets));
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/rolling/rolling_collect_list.cuh b/cpp/src/rolling/rolling_collect_list.cuh
index 0ffafe349b9..95eb1a124c6 100644
--- a/cpp/src/rolling/rolling_collect_list.cuh
+++ b/cpp/src/rolling/rolling_collect_list.cuh
@@ -16,24 +16,20 @@
 
 #pragma once
 
-#include <cudf/aggregation.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/gather.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/reduce.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace detail {
 
-namespace {
 /**
  * @brief Creates the offsets child of the result of the `COLLECT_LIST` window aggregation
  *
@@ -97,73 +93,7 @@ std::unique_ptr<column> create_collect_offsets(size_type input_size,
  *  Mapping back to `input`    == [0,1,0,1,2,1,2,3,2,3,4,3,4]
  */
 std::unique_ptr<column> get_list_child_to_list_row_mapping(cudf::column_view const& offsets,
-                                                           rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr)
-{
-  auto static constexpr size_data_type = data_type{type_to_id<size_type>()};
-
-  // First, reduce offsets column by key, to identify the number of times
-  // an offset appears.
-  // Next, scatter the count for each offset (except the first and last),
-  // into a column of N `0`s, where N == number of child rows.
-  // For the example above:
-  //   offsets        == [0, 2, 5, 8, 11, 13]
-  //   scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]
-  //
-  // If the above example had an empty list row at index 2,
-  // the same columns would look as follows:
-  //   offsets        == [0, 2, 5, 5, 8, 11, 13]
-  //   scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0]
-  //
-  // Note: To correctly handle null list rows at the beginning of
-  // the output column, care must be taken to skip the first `0`
-  // in the offsets column, when running `reduce_by_key()`.
-  // This accounts for the `0` added by default to the offsets
-  // column, marking the beginning of the column.
-
-  auto const num_child_rows{
-    cudf::detail::get_value<size_type>(offsets, offsets.size() - 1, stream)};
-
-  auto scatter_values =
-    make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr);
-  auto scatter_keys =
-    make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr);
-  auto reduced_by_key =
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          offsets.template begin<size_type>() + 1,  // Skip first 0 in offsets.
-                          offsets.template end<size_type>(),
-                          thrust::make_constant_iterator<size_type>(1),
-                          scatter_keys->mutable_view().template begin<size_type>(),
-                          scatter_values->mutable_view().template begin<size_type>());
-  auto scatter_values_end = reduced_by_key.second;
-  auto scatter_output =
-    make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr);
-  thrust::fill_n(rmm::exec_policy(stream),
-                 scatter_output->mutable_view().template begin<size_type>(),
-                 num_child_rows,
-                 0);  // [0,0,0,...0]
-  thrust::scatter(rmm::exec_policy(stream),
-                  scatter_values->mutable_view().template begin<size_type>(),
-                  scatter_values_end,
-                  scatter_keys->view().template begin<size_type>(),
-                  scatter_output->mutable_view().template begin<size_type>());  // [0,0,1,0,0,1,...]
-
-  // Next, generate mapping with inclusive_scan() on scatter() result.
-  // For the example above:
-  //   scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]
-  //   inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4]
-  //
-  // For the case with an empty list at index 3:
-  //   scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0]
-  //   inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5]
-  auto per_row_mapping =
-    make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr);
-  thrust::inclusive_scan(rmm::exec_policy(stream),
-                         scatter_output->view().template begin<size_type>(),
-                         scatter_output->view().template end<size_type>(),
-                         per_row_mapping->mutable_view().template begin<size_type>());
-  return per_row_mapping;
-}
+                                                           rmm::cuda_stream_view stream);
 
 /**
  * @brief Create gather map to generate the child column of the result of
@@ -173,14 +103,10 @@ template <typename PrecedingIter>
 std::unique_ptr<column> create_collect_gather_map(column_view const& child_offsets,
                                                   column_view const& per_row_mapping,
                                                   PrecedingIter preceding_iter,
-                                                  rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::cuda_stream_view stream)
 {
-  auto gather_map = make_fixed_width_column(data_type{type_to_id<size_type>()},
-                                            per_row_mapping.size(),
-                                            mask_state::UNALLOCATED,
-                                            stream,
-                                            mr);
+  auto gather_map = make_fixed_width_column(
+    data_type{type_to_id<size_type>()}, per_row_mapping.size(), mask_state::UNALLOCATED, stream);
   thrust::transform(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
@@ -205,19 +131,7 @@ std::unique_ptr<column> create_collect_gather_map(column_view const& child_offse
  */
 size_type count_child_nulls(column_view const& input,
                             std::unique_ptr<column> const& gather_map,
-                            rmm::cuda_stream_view stream)
-{
-  auto input_device_view = column_device_view::create(input, stream);
-
-  auto input_row_is_null = [d_input = *input_device_view] __device__(auto i) {
-    return d_input.is_null_nocheck(i);
-  };
-
-  return thrust::count_if(rmm::exec_policy(stream),
-                          gather_map->view().template begin<size_type>(),
-                          gather_map->view().template end<size_type>(),
-                          input_row_is_null);
-}
+                            rmm::cuda_stream_view stream);
 
 /**
  * @brief Purge entries for null inputs from gather_map, and adjust offsets.
@@ -228,54 +142,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> purge_null_entries(
   column_view const& offsets,
   size_type num_child_nulls,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  auto input_device_view = column_device_view::create(input, stream);
-
-  auto input_row_not_null = [d_input = *input_device_view] __device__(auto i) {
-    return d_input.is_valid_nocheck(i);
-  };
-
-  // Purge entries in gather_map that correspond to null input.
-  auto new_gather_map = make_fixed_width_column(data_type{type_to_id<size_type>()},
-                                                gather_map.size() - num_child_nulls,
-                                                mask_state::UNALLOCATED,
-                                                stream,
-                                                mr);
-  thrust::copy_if(rmm::exec_policy(stream),
-                  gather_map.template begin<size_type>(),
-                  gather_map.template end<size_type>(),
-                  new_gather_map->mutable_view().template begin<size_type>(),
-                  input_row_not_null);
-
-  // Recalculate offsets after null entries are purged.
-  auto new_sizes = make_fixed_width_column(
-    data_type{type_to_id<size_type>()}, input.size(), mask_state::UNALLOCATED, stream, mr);
-
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(input.size()),
-                    new_sizes->mutable_view().template begin<size_type>(),
-                    [d_gather_map  = gather_map.template begin<size_type>(),
-                     d_old_offsets = offsets.template begin<size_type>(),
-                     input_row_not_null] __device__(auto i) {
-                      return thrust::count_if(thrust::seq,
-                                              d_gather_map + d_old_offsets[i],
-                                              d_gather_map + d_old_offsets[i + 1],
-                                              input_row_not_null);
-                    });
-
-  auto new_offsets =
-    strings::detail::make_offsets_child_column(new_sizes->view().template begin<size_type>(),
-                                               new_sizes->view().template end<size_type>(),
-                                               stream,
-                                               mr);
-
-  return std::make_pair<std::unique_ptr<column>, std::unique_ptr<column>>(std::move(new_gather_map),
-                                                                          std::move(new_offsets));
-}
-
-}  // anonymous namespace
+  rmm::mr::device_memory_resource* mr);
 
 template <typename PrecedingIter, typename FollowingIter>
 std::unique_ptr<column> rolling_collect_list(column_view const& input,
@@ -313,11 +180,11 @@ std::unique_ptr<column> rolling_collect_list(column_view const& input,
 
   // Map each element of the collect() result's child column
   // to the index where it appears in the input.
-  auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream, mr);
+  auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream);
 
   // Generate gather map to produce the collect() result's child column.
-  auto gather_map = create_collect_gather_map(
-    offsets->view(), per_row_mapping->view(), preceding_begin, stream, mr);
+  auto gather_map =
+    create_collect_gather_map(offsets->view(), per_row_mapping->view(), preceding_begin, stream);
 
   // If gather_map collects null elements, and null_policy == EXCLUDE,
   // those elements must be filtered out, and offsets recomputed.
@@ -330,8 +197,12 @@ std::unique_ptr<column> rolling_collect_list(column_view const& input,
   }
 
   // gather(), to construct child column.
-  auto gather_output =
-    cudf::gather(table_view{std::vector<column_view>{input}}, gather_map->view());
+  auto gather_output = cudf::detail::gather(table_view{std::vector<column_view>{input}},
+                                            gather_map->view(),
+                                            cudf::out_of_bounds_policy::DONT_CHECK,
+                                            cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                            stream,
+                                            mr);
 
   rmm::device_buffer null_mask;
   size_type null_count;

From 332dedf0ff6cf61e9426b5e1958a2f19f5eebb02 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Mon, 23 Aug 2021 20:04:23 +0530
Subject: [PATCH 25/46] Enable compiled binary ops in libcudf, python and java
 (#8741)

closes https://github.com/rapidsai/cudf/issues/7801

`cudf::binary_operation` calls compiled binary ops.
`cudf::jit::binary_operation` calls jit binary ops.
So, compiled binary ops are called in libcudf (groupby, rescale), python (binary ops) and java (binary ops).

**Breaking change:**
New: Logical and comparison operators can only have a bool output type.
Old: Logical operators could have an integer or any other output type that can be constructed from bool. Comparison operators required a bool output type.


In this release (21.10), the `experimental` namespace is dropped, and compiled binary ops replace jit binary ops in libcudf, except for user-defined ops.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/8741
---
 .../binaryop/binaryop_benchmark.cpp           |   8 +-
 .../binaryop/compiled_binaryop_benchmark.cpp  |   4 +-
 .../binaryop/jit_binaryop_benchmark.cpp       |   4 +-
 cpp/include/cudf/binaryop.hpp                 |  28 +-
 cpp/include/cudf/detail/binaryop.hpp          |  51 ++-
 cpp/src/binaryop/binaryop.cpp                 | 358 +++++++++--------
 cpp/src/binaryop/compiled/binary_ops.cuh      |   4 +-
 cpp/src/binaryop/compiled/binary_ops.hpp      |  23 +-
 cpp/src/binaryop/compiled/util.cpp            |   3 +-
 .../binop-compiled-fixed_point-test.cpp       | 146 +++----
 cpp/tests/binaryop/binop-compiled-test.cpp    |  71 +++-
 cpp/tests/binaryop/binop-integration-test.cpp | 377 +++++++++---------
 cpp/tests/binaryop/binop-null-test.cpp        |  32 +-
 .../binaryop/binop-verify-input-test.cpp      |  12 +-
 cpp/tests/fixed_point/fixed_point_tests.cpp   |  12 +-
 python/cudf/cudf/_lib/cpp/binaryop.pxd        |  24 ++
 python/cudf/cudf/core/column/numerical.py     |  12 +-
 17 files changed, 640 insertions(+), 529 deletions(-)

diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp
index 314d657679b..9de1112a9db 100644
--- a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp
+++ b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp
@@ -74,14 +74,14 @@ static void BM_binaryop_transform(benchmark::State& state)
     auto const op         = cudf::binary_operator::ADD;
     auto result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
     if (reuse_columns) {
-      auto result = cudf::binary_operation(columns.at(0), columns.at(0), op, result_data_type);
+      auto result = cudf::jit::binary_operation(columns.at(0), columns.at(0), op, result_data_type);
       for (cudf::size_type i = 0; i < tree_levels - 1; i++) {
-        result = cudf::binary_operation(result->view(), columns.at(0), op, result_data_type);
+        result = cudf::jit::binary_operation(result->view(), columns.at(0), op, result_data_type);
       }
     } else {
-      auto result = cudf::binary_operation(columns.at(0), columns.at(1), op, result_data_type);
+      auto result = cudf::jit::binary_operation(columns.at(0), columns.at(1), op, result_data_type);
       std::for_each(std::next(columns.cbegin(), 2), columns.cend(), [&](auto const& col) {
-        result = cudf::binary_operation(result->view(), col, op, result_data_type);
+        result = cudf::jit::binary_operation(result->view(), col, op, result_data_type);
       });
     }
   }
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
index aa86f3bedf8..bc0818ace4b 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
@@ -41,11 +41,11 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
   auto output_dtype = cudf::data_type(cudf::type_to_id<TypeOut>());
 
   // Call once for hot cache.
-  cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype);
+  cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
-    cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype);
+    cudf::binary_operation(lhs, rhs, binop, output_dtype);
   }
 }
 
diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp
index 3c02f47eeb7..7fda4a50ea1 100644
--- a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp
+++ b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp
@@ -41,11 +41,11 @@ void BM_binaryop(benchmark::State& state, cudf::binary_operator binop)
   auto output_dtype = cudf::data_type(cudf::type_to_id<TypeOut>());
 
   // Call once for hot cache.
-  cudf::binary_operation(lhs, rhs, binop, output_dtype);
+  cudf::jit::binary_operation(lhs, rhs, binop, output_dtype);
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
-    cudf::binary_operation(lhs, rhs, binop, output_dtype);
+    cudf::jit::binary_operation(lhs, rhs, binop, output_dtype);
   }
 }
 
diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index e6ff6b0eadc..fe548a36cf0 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -82,7 +82,7 @@ enum class binary_operator : int32_t {
  * This distinction is significant in case of non-commutative binary operations
  *
  * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
+ * AND of the validity of the two operands except NullMin and NullMax (logical OR).
  *
  * @param lhs         The left operand scalar
  * @param rhs         The right operand column
@@ -92,6 +92,8 @@ enum class binary_operator : int32_t {
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
  * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
+ * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical
+ * operations.
  */
 std::unique_ptr<column> binary_operation(
   scalar const& lhs,
@@ -108,7 +110,7 @@ std::unique_ptr<column> binary_operation(
  * This distinction is significant in case of non-commutative binary operations
  *
  * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
+ * AND of the validity of the two operands except NullMin and NullMax (logical OR).
  *
  * @param lhs         The left operand column
  * @param rhs         The right operand scalar
@@ -118,6 +120,8 @@ std::unique_ptr<column> binary_operation(
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
  * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
+ * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical
+ * operations.
  */
 std::unique_ptr<column> binary_operation(
   column_view const& lhs,
@@ -132,7 +136,7 @@ std::unique_ptr<column> binary_operation(
  * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()`
  *
  * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
+ * AND of the validity of the two operands except NullMin and NullMax (logical OR).
  *
  * @param lhs         The left operand column
  * @param rhs         The right operand column
@@ -142,6 +146,8 @@ std::unique_ptr<column> binary_operation(
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
  * @throw cudf::logic_error if @p lhs and @p rhs are different sizes
+ * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical
+ * operations.
  * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
  */
 std::unique_ptr<column> binary_operation(
@@ -204,7 +210,7 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
                                                          cudf::data_type const& lhs,
                                                          cudf::data_type const& rhs);
 
-namespace experimental {
+namespace jit {
 /**
  * @brief Performs a binary operation between a scalar and a column.
  *
@@ -213,7 +219,7 @@ namespace experimental {
  * This distinction is significant in case of non-commutative binary operations
  *
  * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands except NullMin and NullMax (logical OR).
+ * AND of the validity of the two operands
  *
  * @param lhs         The left operand scalar
  * @param rhs         The right operand column
@@ -223,8 +229,6 @@ namespace experimental {
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
  * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
- * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical
- * operations.
  */
 std::unique_ptr<column> binary_operation(
   scalar const& lhs,
@@ -241,7 +245,7 @@ std::unique_ptr<column> binary_operation(
  * This distinction is significant in case of non-commutative binary operations
  *
  * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands except NullMin and NullMax (logical OR).
+ * AND of the validity of the two operands
  *
  * @param lhs         The left operand column
  * @param rhs         The right operand scalar
@@ -251,8 +255,6 @@ std::unique_ptr<column> binary_operation(
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
  * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
- * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical
- * operations.
  */
 std::unique_ptr<column> binary_operation(
   column_view const& lhs,
@@ -267,7 +269,7 @@ std::unique_ptr<column> binary_operation(
  * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()`
  *
  * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands except NullMin and NullMax (logical OR).
+ * AND of the validity of the two operands
  *
  * @param lhs         The left operand column
  * @param rhs         The right operand column
@@ -277,8 +279,6 @@ std::unique_ptr<column> binary_operation(
  * @return            Output column of `output_type` type containing the result of
  *                    the binary operation
  * @throw cudf::logic_error if @p lhs and @p rhs are different sizes
- * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical
- * operations.
  * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
  */
 std::unique_ptr<column> binary_operation(
@@ -287,6 +287,6 @@ std::unique_ptr<column> binary_operation(
   binary_operator op,
   data_type output_type,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-}  // namespace experimental
+}  // namespace jit
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp
index c12482967e1..ce7731ef7d2 100644
--- a/cpp/include/cudf/detail/binaryop.hpp
+++ b/cpp/include/cudf/detail/binaryop.hpp
@@ -22,8 +22,9 @@
 namespace cudf {
 //! Inner interfaces and implementations
 namespace detail {
+namespace jit {
 /**
- * @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator,
+ * @copydoc cudf::jit::binary_operation(scalar const&, column_view const&, binary_operator,
  * data_type, rmm::mr::device_memory_resource *)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
@@ -37,7 +38,7 @@ std::unique_ptr<column> binary_operation(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator,
+ * @copydoc cudf::jit::binary_operation(column_view const&, scalar const&, binary_operator,
  * data_type, rmm::mr::device_memory_resource *)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
@@ -51,7 +52,7 @@ std::unique_ptr<column> binary_operation(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @copydoc cudf::binary_operation(column_view const&, column_view const&,
+ * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&,
  * binary_operator, data_type, rmm::mr::device_memory_resource *)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
@@ -63,9 +64,10 @@ std::unique_ptr<column> binary_operation(
   data_type output_type,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+}  // namespace jit
 
 /**
- * @copydoc cudf::binary_operation(column_view const&, column_view const&,
+ * @copydoc cudf::binary_operation(column_view const&, column_view const&,
  * std::string const&, data_type, rmm::mr::device_memory_resource *)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
@@ -78,5 +80,46 @@ std::unique_ptr<column> binary_operation(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator,
+ * data_type, rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> binary_operation(
+  scalar const& lhs,
+  column_view const& rhs,
+  binary_operator op,
+  data_type output_type,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator,
+ * data_type, rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> binary_operation(
+  column_view const& lhs,
+  scalar const& rhs,
+  binary_operator op,
+  data_type output_type,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::binary_operation(column_view const&, column_view const&,
+ * binary_operator, data_type, rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> binary_operation(
+  column_view const& lhs,
+  column_view const& rhs,
+  binary_operator op,
+  data_type output_type,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index aaf193ff5cf..a1b00a4cd6b 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -47,9 +47,7 @@
 #include <thrust/optional.h>
 
 namespace cudf {
-
 namespace binops {
-namespace detail {
 
 /**
  * @brief Computes output valid mask for op between a column and a scalar
@@ -69,7 +67,63 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col,
     return rmm::device_buffer{0, stream, mr};
   }
 }
-}  // namespace detail
+
+/**
+ * @brief Does the binop need to know if an operand is null/invalid to perform special
+ * processing?
+ */
+inline bool is_null_dependent(binary_operator op)
+{
+  return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN ||
+         op == binary_operator::NULL_MAX;
+}
+
+/**
+ * @brief Returns `true` if `binary_operator` `op` is a basic arithmetic binary operation
+ */
+bool is_basic_arithmetic_binop(binary_operator op)
+{
+  return op == binary_operator::ADD or       // operator +
+         op == binary_operator::SUB or       // operator -
+         op == binary_operator::MUL or       // operator *
+         op == binary_operator::DIV or       // operator / using common type of lhs and rhs
+         op == binary_operator::NULL_MIN or  // 2 null = null, 1 null = value, else min
+         op == binary_operator::NULL_MAX;    // 2 null = null, 1 null = value, else max
+}
+
+/**
+ * @brief Returns `true` if `binary_operator` `op` is a comparison binary operation
+ */
+bool is_comparison_binop(binary_operator op)
+{
+  return op == binary_operator::EQUAL or          // operator ==
+         op == binary_operator::NOT_EQUAL or      // operator !=
+         op == binary_operator::LESS or           // operator <
+         op == binary_operator::GREATER or        // operator >
+         op == binary_operator::LESS_EQUAL or     // operator <=
+         op == binary_operator::GREATER_EQUAL or  // operator >=
+         op == binary_operator::NULL_EQUALS;      // 2 null = true; 1 null = false; else ==
+}
+
+/**
+ * @brief Returns `true` if `binary_operator` `op` is supported by `fixed_point`
+ */
+bool is_supported_fixed_point_binop(binary_operator op)
+{
+  return is_basic_arithmetic_binop(op) or is_comparison_binop(op);
+}
+
+/**
+ * @brief Helper predicate function that identifies if `op` requires scales to be the same
+ *
+ * @param op `binary_operator`
+ * @return true `op` requires scales of lhs and rhs to be the same
+ * @return false `op` does not require scales of lhs and rhs to be the same
+ */
+bool is_same_scale_necessary(binary_operator op)
+{
+  return op != binary_operator::MUL && op != binary_operator::DIV;
+}
 
 namespace jit {
 
@@ -208,8 +262,47 @@ void binary_operation(mutable_column_view& out,
              cudf::jit::get_data_ptr(lhs),
              cudf::jit::get_data_ptr(rhs));
 }
-
 }  // namespace jit
+
+// Compiled Binary operation
+namespace compiled {
+/**
+ * @copydoc cudf::binary_operation(column_view const&, column_view const&,
+ * binary_operator, data_type, rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+template <typename LhsType, typename RhsType>
+std::unique_ptr<column> binary_operation(LhsType const& lhs,
+                                         RhsType const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  if constexpr (std::is_same_v<LhsType, column_view> and std::is_same_v<RhsType, column_view>)
+    CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
+
+  if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and
+      output_type.id() == type_id::STRING and
+      (op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN))
+    return cudf::binops::compiled::string_null_min_max(lhs, rhs, op, output_type, stream, mr);
+
+  if (not cudf::binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op))
+    CUDF_FAIL("Unsupported operator for these types");
+
+  auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
+
+  if constexpr (std::is_same_v<LhsType, column_view>)
+    if (lhs.is_empty()) return out;
+  if constexpr (std::is_same_v<RhsType, column_view>)
+    if (rhs.is_empty()) return out;
+
+  auto out_view = out->mutable_view();
+  cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream);
+  return out;
+}
+}  // namespace compiled
 }  // namespace binops
 
 namespace detail {
@@ -245,7 +338,7 @@ std::unique_ptr<column> make_fixed_width_column_for_output(scalar const& lhs,
   if (binops::is_null_dependent(op)) {
     return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr);
   } else {
-    auto new_mask = binops::detail::scalar_col_valid_mask_and(rhs, lhs, stream, mr);
+    auto new_mask = binops::scalar_col_valid_mask_and(rhs, lhs, stream, mr);
     return make_fixed_width_column(
       output_type, rhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
   }
@@ -272,7 +365,7 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lh
   if (binops::is_null_dependent(op)) {
     return make_fixed_width_column(output_type, lhs.size(), mask_state::ALL_VALID, stream, mr);
   } else {
-    auto new_mask = binops::detail::scalar_col_valid_mask_and(lhs, rhs, stream, mr);
+    auto new_mask = binops::scalar_col_valid_mask_and(lhs, rhs, stream, mr);
     return make_fixed_width_column(
       output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
   }
@@ -305,53 +398,6 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lh
   }
 };
 
-/**
- * @brief Returns `true` if `binary_operator` `op` is a basic arithmetic binary operation
- */
-bool is_basic_arithmetic_binop(binary_operator op)
-{
-  return op == binary_operator::ADD or       // operator +
-         op == binary_operator::SUB or       // operator -
-         op == binary_operator::MUL or       // operator *
-         op == binary_operator::DIV or       // operator / using common type of lhs and rhs
-         op == binary_operator::NULL_MIN or  // 2 null = null, 1 null = value, else min
-         op == binary_operator::NULL_MAX;    // 2 null = null, 1 null = value, else max
-}
-
-/**
- * @brief Returns `true` if `binary_operator` `op` is a comparison binary operation
- */
-bool is_comparison_binop(binary_operator op)
-{
-  return op == binary_operator::EQUAL or          // operator ==
-         op == binary_operator::NOT_EQUAL or      // operator !=
-         op == binary_operator::LESS or           // operator <
-         op == binary_operator::GREATER or        // operator >
-         op == binary_operator::LESS_EQUAL or     // operator <=
-         op == binary_operator::GREATER_EQUAL or  // operator >=
-         op == binary_operator::NULL_EQUALS;      // 2 null = true; 1 null = false; else ==
-}
-
-/**
- * @brief Returns `true` if `binary_operator` `op` is supported by `fixed_point`
- */
-bool is_supported_fixed_point_binop(binary_operator op)
-{
-  return is_basic_arithmetic_binop(op) or is_comparison_binop(op);
-}
-
-/**
- * @brief Helper predicate function that identifies if `op` requires scales to be the same
- *
- * @param op `binary_operator`
- * @return true `op` requires scales of lhs and rhs to be the same
- * @return false `op` does not require scales of lhs and rhs to be the same
- */
-bool is_same_scale_necessary(binary_operator op)
-{
-  return op != binary_operator::MUL && op != binary_operator::DIV;
-}
-
 template <typename Lhs, typename Rhs>
 void fixed_point_binary_operation_validation(binary_operator op,
                                              Lhs lhs,
@@ -360,10 +406,11 @@ void fixed_point_binary_operation_validation(binary_operator op,
 {
   CUDF_EXPECTS(is_fixed_point(lhs), "Input must have fixed_point data_type.");
   CUDF_EXPECTS(is_fixed_point(rhs), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation");
+  CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op),
+               "Unsupported fixed_point binary operation");
   CUDF_EXPECTS(lhs.id() == rhs.id(), "Data type mismatch");
   if (output_type.has_value()) {
-    if (is_comparison_binop(op))
+    if (binops::is_comparison_binop(op))
       CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8},
                    "Comparison operations require boolean output type.");
     else
@@ -372,6 +419,7 @@ void fixed_point_binary_operation_validation(binary_operator op,
   }
 }
 
+namespace jit {
 /**
  * @brief Function to compute binary operation of one `column_view` and one `scalar`
  *
@@ -397,12 +445,12 @@ std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
     return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
 
   auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
-  auto const type =
-    is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{rhs.type().id(), scale};
-  auto out      = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
-  auto out_view = out->mutable_view();
+  auto const type  = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8}
+                                                     : cudf::data_type{rhs.type().id(), scale};
+  auto out         = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
+  auto out_view    = out->mutable_view();
 
-  if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
+  if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) {
     // Adjust scalar/column so they have they same scale
     if (rhs.type().scale() < lhs.type().scale()) {
       auto const diff = lhs.type().scale() - rhs.type().scale();
@@ -426,12 +474,12 @@ std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
         if (lhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
@@ -467,12 +515,12 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
     return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
 
   auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
-  auto const type =
-    is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{lhs.type().id(), scale};
-  auto out      = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
-  auto out_view = out->mutable_view();
+  auto const type  = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8}
+                                                     : cudf::data_type{lhs.type().id(), scale};
+  auto out         = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
+  auto out_view    = out->mutable_view();
 
-  if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
+  if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) {
     // Adjust scalar/column so they have they same scale
     if (rhs.type().scale() > lhs.type().scale()) {
       auto const diff = rhs.type().scale() - lhs.type().scale();
@@ -496,12 +544,12 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
         if (rhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
@@ -537,24 +585,24 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
     return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
 
   auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
-  auto const type =
-    is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{lhs.type().id(), scale};
-  auto out      = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
-  auto out_view = out->mutable_view();
+  auto const type  = binops::is_comparison_binop(op) ? data_type{type_id::BOOL8}
+                                                     : cudf::data_type{lhs.type().id(), scale};
+  auto out         = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
+  auto out_view    = out->mutable_view();
 
-  if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
+  if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) {
     if (rhs.type().scale() < lhs.type().scale()) {
       auto const diff   = lhs.type().scale() - rhs.type().scale();
       auto const result = [&] {
         if (lhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
@@ -564,12 +612,12 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
         if (lhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
-          return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
           auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
-          return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
+          return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
@@ -587,8 +635,9 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
+  // calls compiled ops for string types
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return experimental::binary_operation(lhs, rhs, op, output_type, mr);
+    return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -614,8 +663,9 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
+  // calls compiled ops for string types
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return experimental::binary_operation(lhs, rhs, op, output_type, mr);
+    return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -643,8 +693,9 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
 {
   CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
 
+  // calls compiled ops for string types
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return experimental::binary_operation(lhs, rhs, op, output_type, mr);
+    return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -662,6 +713,72 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
   return out;
 }
+}  // namespace jit
+}  // namespace detail
+
+namespace jit {
+std::unique_ptr<column> binary_operation(scalar const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         scalar const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+}  // namespace jit
+
+namespace detail {
+std::unique_ptr<column> binary_operation(scalar const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  return binops::compiled::binary_operation<scalar, column_view>(
+    lhs, rhs, op, output_type, stream, mr);  // forward the caller's stream, not the default one
+}
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         scalar const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  return binops::compiled::binary_operation<column_view, scalar>(
+    lhs, rhs, op, output_type, stream, mr);  // forward the caller's stream, not the default one
+}
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  return binops::compiled::binary_operation<column_view, column_view>(
+    lhs, rhs, op, output_type, stream, mr);  // forward the caller's stream, not the default one
+}
 
 std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          column_view const& rhs,
@@ -693,14 +810,13 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   binops::jit::binary_operation(out_view, lhs, rhs, ptx, stream);
   return out;
 }
-
 }  // namespace detail
 
 int32_t binary_operation_fixed_point_scale(binary_operator op,
                                            int32_t left_scale,
                                            int32_t right_scale)
 {
-  CUDF_EXPECTS(cudf::detail::is_supported_fixed_point_binop(op),
+  CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op),
                "Unsupported fixed_point binary operation.");
   if (op == binary_operator::MUL) return left_scale + right_scale;
   if (op == binary_operator::DIV) return left_scale - right_scale;
@@ -726,7 +842,6 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
   CUDF_FUNC_RANGE();
   return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
 }
-
 std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          scalar const& rhs,
                                          binary_operator op,
@@ -736,7 +851,6 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   CUDF_FUNC_RANGE();
   return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
 }
-
 std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          column_view const& rhs,
                                          binary_operator op,
@@ -757,78 +871,4 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   return detail::binary_operation(lhs, rhs, ptx, output_type, rmm::cuda_stream_default, mr);
 }
 
-// Experimental Compiled Binary operation
-namespace experimental {
-namespace detail {
-/**
- * @copydoc cudf::experimental::binary_operation(column_view const&, column_view const&,
- * binary_operator, data_type, rmm::mr::device_memory_resource*)
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
- */
-template <typename LhsType, typename RhsType>
-std::unique_ptr<column> binary_operation(LhsType const& lhs,
-                                         RhsType const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  if constexpr (std::is_same_v<LhsType, column_view> and std::is_same_v<RhsType, column_view>)
-    CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
-
-  if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and
-      output_type.id() == type_id::STRING and
-      (op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN))
-    return binops::compiled::string_null_min_max(lhs, rhs, op, output_type, stream, mr);
-
-  if (not binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op))
-    CUDF_FAIL("Unsupported operator for these types");
-
-  // TODO check if scale conversion required?
-  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
-  //  CUDF_FAIL("Not yet supported fixed_point");
-  // return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
-
-  auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
-
-  if constexpr (std::is_same_v<LhsType, column_view>)
-    if (lhs.is_empty()) return out;
-  if constexpr (std::is_same_v<RhsType, column_view>)
-    if (rhs.is_empty()) return out;
-
-  auto out_view = out->mutable_view();
-  cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream);
-  return out;
-}
-}  // namespace detail
-
-std::unique_ptr<column> binary_operation(scalar const& lhs,
-                                         column_view const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
-}
-std::unique_ptr<column> binary_operation(column_view const& lhs,
-                                         scalar const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
-}
-std::unique_ptr<column> binary_operation(column_view const& lhs,
-                                         column_view const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
-}
-}  // namespace experimental
 }  // namespace cudf
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index b17f3eddc5d..84147fc9220 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -68,7 +68,9 @@ struct typed_casted_writer {
     if constexpr (mutable_column_device_view::has_element_accessor<Element>() and
                   std::is_constructible_v<Element, FromType>) {
       col.element<Element>(i) = static_cast<Element>(val);
-    } else if constexpr (is_fixed_point<Element>() and std::is_constructible_v<Element, FromType>) {
+    } else if constexpr (is_fixed_point<Element>() and
+                         (is_fixed_point<FromType>() or
+                          std::is_constructible_v<Element, FromType>)) {
       if constexpr (is_fixed_point<FromType>())
         col.data<Element::rep>()[i] = val.rescaled(numeric::scale_type{col.type().scale()}).value();
       else
diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp
index 2a814c16d57..cf3a6025847 100644
--- a/cpp/src/binaryop/compiled/binary_ops.hpp
+++ b/cpp/src/binaryop/compiled/binary_ops.hpp
@@ -29,26 +29,6 @@ class column_device_view;
 class mutable_column_device_view;
 
 namespace binops {
-namespace detail {
-/**
- * @brief Computes output valid mask for op between a column and a scalar
- */
-rmm::device_buffer scalar_col_valid_mask_and(column_view const& col,
-                                             scalar const& s,
-                                             rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
-}  // namespace detail
-
-/**
- * @brief Does the binop need to know if an operand is null/invalid to perform special
- * processing?
- */
-inline bool is_null_dependent(binary_operator op)
-{
-  return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN ||
-         op == binary_operator::NULL_MAX;
-}
-
 namespace compiled {
 
 std::unique_ptr<column> string_null_min_max(
@@ -132,8 +112,7 @@ std::unique_ptr<column> binary_operation(
  *
  * @note The sizes of @p lhs and @p rhs should be the same
  *
- * The output contains the result of op(lhs[i], rhs[i]) for all 0 <= i <
- * lhs.size()
+ * The output contains the result of op(lhs[i], rhs[i]) for all 0 <= i < lhs.size()
  *
  * Regardless of the operator, the validity of the output value is the logical
  * AND of the validity of the two operands
diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp
index d6ce4d3edeb..f89941a3d68 100644
--- a/cpp/src/binaryop/compiled/util.cpp
+++ b/cpp/src/binaryop/compiled/util.cpp
@@ -89,7 +89,8 @@ struct is_binary_operation_supported {
         using common_t = std::common_type_t<TypeLhs, TypeRhs>;
         if constexpr (std::is_invocable_v<BinaryOperator, common_t, common_t>) {
           using ReturnType = std::invoke_result_t<BinaryOperator, common_t, common_t>;
-          return std::is_constructible_v<TypeOut, ReturnType>;
+          return std::is_constructible_v<TypeOut, ReturnType> or
+                 (is_fixed_point<ReturnType>() and is_fixed_point<TypeOut>());
         }
       } else {
         if constexpr (std::is_invocable_v<BinaryOperator, TypeLhs, TypeRhs>) {
diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
index feb75cc3f09..a6477247356 100644
--- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
@@ -68,8 +68,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -102,8 +101,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -125,8 +123,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -145,8 +142,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -165,8 +161,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -183,8 +178,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv3)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -204,8 +198,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv4)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -224,8 +217,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -244,8 +236,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd3)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -262,8 +253,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd4)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::ADD, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -280,8 +270,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd5)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::ADD, lhs->type(), static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -298,10 +287,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd6)
   auto const expected2 = fp_wrapper<RepType>{{6, 0, 1, 1, 1, 1}, scale_type{1}};
   auto const type1     = cudf::data_type{cudf::type_to_id<decimalXX>(), 0};
   auto const type2     = cudf::data_type{cudf::type_to_id<decimalXX>(), 1};
-  auto const result1 =
-    cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type1);
-  auto const result2 =
-    cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type2);
+  auto const result1   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1);
+  auto const result2   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
@@ -333,8 +320,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiplyScalar)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::MUL, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -353,8 +339,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpSimplePlus)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -370,8 +355,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimple)
   auto const col2     = fp_wrapper<RepType>{{100, 200, 300, 400}, scale_type{-2}};
   auto const expected = wrapper<bool>(trues.begin(), trues.end());
 
-  auto const result = cudf::experimental::binary_operation(
-    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -386,8 +371,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0)
   auto const col      = fp_wrapper<RepType>{{1, 2, 3, 4}, scale_type{0}};
   auto const expected = wrapper<bool>(trues.begin(), trues.end());
 
-  auto const result = cudf::experimental::binary_operation(
-    col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result =
+    cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -402,8 +387,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0Nu
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
   auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
 
-  auto const result = cudf::experimental::binary_operation(
-    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -418,8 +403,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale2Nu
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
   auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
 
-  auto const result = cudf::experimental::binary_operation(
-    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -445,8 +430,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualLessGreater)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(iota_3).type(),
                                                    static_cast<cudf::column_view>(zeros_3).type());
-  auto const iota_3_after_add =
-    cudf::experimental::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
+  auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view());
 
@@ -455,17 +439,17 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualLessGreater)
   auto const trues    = std::vector<bool>(sz, true);
   auto const true_col = wrapper<bool>(trues.begin(), trues.end());
 
-  auto const btype        = cudf::data_type{type_id::BOOL8};
-  auto const equal_result = cudf::experimental::binary_operation(
-    iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
+  auto const btype = cudf::data_type{type_id::BOOL8};
+  auto const equal_result =
+    cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view());
 
-  auto const less_result = cudf::experimental::binary_operation(
-    zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
+  auto const less_result =
+    cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view());
 
-  auto const greater_result = cudf::experimental::binary_operation(
-    iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
+  auto const greater_result =
+    cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
 }
 
@@ -484,8 +468,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMaxSimple)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX,
                                                    static_cast<cudf::column_view>(col1).type(),
                                                    static_cast<cudf::column_view>(col2).type());
-  auto const result =
-    cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
+  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -505,8 +488,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMinSimple)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN,
                                                    static_cast<cudf::column_view>(col1).type(),
                                                    static_cast<cudf::column_view>(col2).type());
-  auto const result =
-    cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
+  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -522,7 +504,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullEqualsSimple)
   auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
   auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
 
-  auto const result = cudf::experimental::binary_operation(
+  auto const result = cudf::binary_operation(
     col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
@@ -538,9 +520,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div)
   auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{25, 75, 125, 175}, scale_type{-2}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), -2};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), -2};
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -555,9 +536,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div2)
   auto const rhs      = fp_wrapper<RepType>{{20, 20, 20, 20}, scale_type{-1}};
   auto const expected = fp_wrapper<RepType>{{5000, 15000, 25000, 35000}, scale_type{-2}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), -2};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), -2};
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -572,9 +552,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div3)
   auto const rhs      = fp_wrapper<RepType>{{3, 9, 3, 3}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{3333, 3333, 16666, 23333}, scale_type{-2}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), -2};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), -2};
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -589,9 +568,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div4)
   auto const rhs      = make_fixed_point_scalar<decimalXX>(3, scale_type{0});
   auto const expected = fp_wrapper<RepType>{{3, 10, 16, 23}, scale_type{1}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), 1};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), 1};
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -607,9 +585,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div6)
 
   auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), -2};
-  auto const result =
-    cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), -2};
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -625,9 +602,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div7)
 
   auto const expected = fp_wrapper<RepType>{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), 2};
-  auto const result =
-    cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), 2};
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -642,9 +618,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div8)
   auto const rhs      = make_fixed_point_scalar<decimalXX>(5000, scale_type{-3});
   auto const expected = fp_wrapper<RepType>{{0, 1, 16}, scale_type{2}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), 2};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), 2};
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -659,9 +634,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div9)
   auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{1});
   auto const expected = fp_wrapper<RepType>{{1, 2, 4}, scale_type{1}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), 1};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), 1};
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -676,9 +650,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div10)
   auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{0});
   auto const expected = fp_wrapper<RepType>{{14, 28, 42}, scale_type{1}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), 1};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), 1};
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -693,9 +666,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div11)
   auto const rhs      = fp_wrapper<RepType>{{7, 7, 7}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{142, 285, 428}, scale_type{1}};
 
-  auto const type = data_type{type_to_id<decimalXX>(), 1};
-  auto const result =
-    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), 1};
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -708,14 +680,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpThrows)
 
   auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
   auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
-  auto const float_type    = data_type{type_id::FLOAT32};
-  EXPECT_THROW(
-    cudf::experimental::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
-    cudf::logic_error);
-  // Allowed now, but not allowed in jit.
-  // EXPECT_THROW(cudf::experimental::binary_operation(col, col, cudf::binary_operator::MUL,
-  // float_type),
-  //              cudf::logic_error);
+  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
+               cudf::logic_error);
 }
 
 }  // namespace cudf::test::binop
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 081ae41fef1..25d2f1d2c24 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -79,15 +79,24 @@ struct BinaryOperationCompiledTest : public BinaryOperationTest {
     auto lhs = lhs_random_column<TypeLhs>(col_size);
     auto rhs = rhs_random_column<TypeRhs>(col_size);
 
-    auto out = cudf::experimental::binary_operation(lhs, rhs, op, data_type(type_to_id<TypeOut>()));
+    auto out = cudf::binary_operation(lhs, rhs, op, data_type(type_to_id<TypeOut>()));
     ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, OPERATOR());
 
     auto s_lhs = this->template make_random_wrapped_scalar<TypeLhs>();
     auto s_rhs = this->template make_random_wrapped_scalar<TypeRhs>();
+    s_lhs.set_valid_async(true);
+    s_rhs.set_valid_async(true);
 
-    out = cudf::experimental::binary_operation(lhs, s_rhs, op, data_type(type_to_id<TypeOut>()));
+    out = cudf::binary_operation(lhs, s_rhs, op, data_type(type_to_id<TypeOut>()));
     ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, s_rhs, OPERATOR());
-    out = cudf::experimental::binary_operation(s_lhs, rhs, op, data_type(type_to_id<TypeOut>()));
+    out = cudf::binary_operation(s_lhs, rhs, op, data_type(type_to_id<TypeOut>()));
+    ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, s_lhs, rhs, OPERATOR());
+
+    s_lhs.set_valid_async(false);
+    s_rhs.set_valid_async(false);
+    out = cudf::binary_operation(lhs, s_rhs, op, data_type(type_to_id<TypeOut>()));
+    ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, s_rhs, OPERATOR());
+    out = cudf::binary_operation(s_lhs, rhs, op, data_type(type_to_id<TypeOut>()));
     ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, s_lhs, rhs, OPERATOR());
   }
 };
@@ -305,8 +314,8 @@ TYPED_TEST(BinaryOperationCompiledTest_FloatOps, Pow_Vector_Vector)
   }();
   auto rhs = rhs_random_column<TypeRhs>(100);
 
-  auto out = cudf::experimental::binary_operation(
-    lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, POW(), NearEqualComparator<TypeOut>{2});
 }
@@ -333,7 +342,7 @@ TYPED_TEST(BinaryOperationCompiledTest_FloatOps, LogBase_Vector_Vector)
   auto rhs_elements = cudf::detail::make_counting_transform_iterator(0, [](auto) { return 7; });
   fixed_width_column_wrapper<TypeRhs> rhs(rhs_elements, rhs_elements + 50);
 
-  auto out = cudf::experimental::binary_operation(
+  auto out = cudf::binary_operation(
     lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LOG_BASE());
@@ -355,7 +364,7 @@ TYPED_TEST(BinaryOperationCompiledTest_FloatOps, ATan2_Vector_Vector)
   auto lhs = lhs_random_column<TypeLhs>(col_size);
   auto rhs = rhs_random_column<TypeRhs>(col_size);
 
-  auto out = cudf::experimental::binary_operation(
+  auto out = cudf::binary_operation(
     lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ATAN2(), NearEqualComparator<TypeOut>{2});
@@ -519,6 +528,11 @@ struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest<
 };
 TYPED_TEST_CASE(BinaryOperationCompiledTest_NullOps, Null_types);
 
+template <typename T>  // Selects the test column wrapper type for T (strings vs. fixed-width).
+using column_wrapper = std::conditional_t<std::is_same_v<T, std::string>,
+                                          cudf::test::strings_column_wrapper,
+                                          cudf::test::fixed_width_column_wrapper<T>>;
+
 template <typename TypeOut, typename TypeLhs, typename TypeRhs, class OP>
 auto NullOp_Result(column_view lhs, column_view rhs)
 {
@@ -537,8 +551,7 @@ auto NullOp_Result(column_view lhs, column_view rhs)
                    result_mask.push_back(output_valid);
                    return result;
                  });
-  return cudf::test::fixed_width_column_wrapper<TypeOut>(
-    result.cbegin(), result.cend(), result_mask.cbegin());
+  return column_wrapper<TypeOut>(result.cbegin(), result.cend(), result_mask.cbegin());
 }
 
 TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullEquals_Vector_Vector)
@@ -552,7 +565,7 @@ TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullEquals_Vector_Vector)
   auto rhs            = rhs_random_column<TypeRhs>(col_size);
   auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_EQUALS>(lhs, rhs);
 
-  auto const result = cudf::experimental::binary_operation(
+  auto const result = cudf::binary_operation(
     lhs, rhs, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -570,7 +583,7 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullEquals_Vector_Vector)
   auto rhs            = rhs_random_column<TypeRhs>(col_size);
   auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_EQUALS>(lhs, rhs);
 
-  auto const result = cudf::experimental::binary_operation(
+  auto const result = cudf::binary_operation(
     lhs, rhs, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -586,7 +599,7 @@ TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullMax_Vector_Vector)
   auto rhs            = rhs_random_column<TypeRhs>(col_size);
   auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_MAX>(lhs, rhs);
 
-  auto const result = cudf::experimental::binary_operation(
+  auto const result = cudf::binary_operation(
     lhs, rhs, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -602,9 +615,41 @@ TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullMin_Vector_Vector)
   auto rhs            = rhs_random_column<TypeRhs>(col_size);
   auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_MIN>(lhs, rhs);
 
-  auto const result = cudf::experimental::binary_operation(
+  auto const result = cudf::binary_operation(
     lhs, rhs, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMax_Vector_Vector)
+{  // Checks binary_operation(NULL_MAX) for std::string operands against NullOp_Result.
+  using TypeOut  = std::string;
+  using TypeLhs  = std::string;
+  using TypeRhs  = std::string;
+  using NULL_MAX = cudf::library::operation::NullMax<TypeOut, TypeLhs, TypeRhs>;
+
+  auto lhs            = lhs_random_column<TypeLhs>(col_size);
+  auto rhs            = rhs_random_column<TypeRhs>(col_size);
+  auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_MAX>(lhs, rhs);
+
+  auto const result = cudf::binary_operation(  // output data_type built from cudf::string_view
+    lhs, rhs, cudf::binary_operator::NULL_MAX, data_type(type_to_id<cudf::string_view>()));
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+}
+
+TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector)
+{  // Checks binary_operation(NULL_MIN) for std::string operands against NullOp_Result.
+  using TypeOut  = std::string;
+  using TypeLhs  = std::string;
+  using TypeRhs  = std::string;
+  using NULL_MIN = cudf::library::operation::NullMin<TypeOut, TypeLhs, TypeRhs>;
+
+  auto lhs            = lhs_random_column<TypeLhs>(col_size);
+  auto rhs            = rhs_random_column<TypeRhs>(col_size);
+  auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_MIN>(lhs, rhs);
+
+  auto const result = cudf::binary_operation(  // output data_type built from cudf::string_view
+    lhs, rhs, cudf::binary_operator::NULL_MIN, data_type(type_to_id<cudf::string_view>()));
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+}
+
 }  // namespace cudf::test::binop
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 68a8845132b..ec011a84037 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -53,8 +53,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_SI32_FP32_SI64)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(10000);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -70,8 +70,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP32_FP32)
   auto lhs = make_random_wrapped_column<TypeLhs>(10000);
   auto rhs = make_random_wrapped_column<TypeRhs>(10000);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -87,8 +87,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_SI32_FP32_FP32)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(10000);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
 }
@@ -103,8 +103,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_SI08_SI16_SI32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -119,8 +119,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP64_SI08)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -135,8 +135,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
 }
@@ -152,8 +152,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_SI64_FP64_SI32)
   auto lhs = make_random_wrapped_column<TypeLhs>(10000);
   auto rhs = make_random_wrapped_scalar<TypeRhs>();
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
 }
@@ -168,8 +168,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_TimepointD_DurationS_Ti
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
 }
@@ -184,8 +184,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_TimepointD_TimepointS_D
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = cudf::scalar_type_t<TypeRhs>(typename TypeRhs::duration{34}, true);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
 }
@@ -200,8 +200,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_DurationS_DurationD_Dur
 
   auto lhs = cudf::scalar_type_t<TypeLhs>(TypeLhs{-9});
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
 }
@@ -216,8 +216,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
 }
@@ -232,8 +232,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64_FP32_FP32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
 }
@@ -249,8 +249,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Scalar_Vector_SI32_DurationD_Duration
 
   auto lhs = cudf::scalar_type_t<TypeLhs>(2);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
 }
@@ -266,8 +266,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_DurationS_SI32_Duration
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
 }
@@ -282,8 +282,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
 }
@@ -298,8 +298,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64_FP32_FP32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
 }
@@ -315,8 +315,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Scalar_Vector_DurationD_SI32_Duration
   // Divide 2 days by an integer and convert the ticks to seconds
   auto lhs = cudf::scalar_type_t<TypeLhs>(TypeLhs{2});
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
 }
@@ -331,8 +331,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_DurationD_DurationS_Dur
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
 }
@@ -347,7 +347,7 @@ TEST_F(BinaryOperationIntegrationTest, TrueDiv_Vector_Vector_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::TRUE_DIV, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, TRUEDIV());
@@ -363,7 +363,7 @@ TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, FLOORDIV());
@@ -379,7 +379,7 @@ TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64_FP32_FP32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, FLOORDIV());
@@ -395,8 +395,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
 }
@@ -411,8 +411,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
 }
@@ -427,8 +427,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64_FP32_FP32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
 }
@@ -443,8 +443,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
 }
@@ -460,8 +460,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationD_SI32_Duration
   // Half the number of days and convert the remainder ticks to microseconds
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = cudf::scalar_type_t<TypeRhs>(2);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
 }
@@ -476,8 +476,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationS_DurationMS_Du
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
 }
@@ -492,8 +492,8 @@ TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP64_SI64_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
 
   /**
    * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision
@@ -513,8 +513,8 @@ TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
   /**
    * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision
    * Mathematical Standard Library Functions with Maximum ULP Error'
@@ -533,7 +533,7 @@ TEST_F(BinaryOperationIntegrationTest, And_Vector_Vector_SI16_SI64_SI32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::BITWISE_AND, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, AND());
@@ -549,7 +549,7 @@ TEST_F(BinaryOperationIntegrationTest, Or_Vector_Vector_SI64_SI16_SI32)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::BITWISE_OR, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, OR());
@@ -565,7 +565,7 @@ TEST_F(BinaryOperationIntegrationTest, Xor_Vector_Vector_SI32_SI16_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::BITWISE_XOR, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, XOR());
@@ -581,7 +581,7 @@ TEST_F(BinaryOperationIntegrationTest, Logical_And_Vector_Vector_SI16_FP64_SI8)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::LOGICAL_AND, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, AND());
@@ -597,7 +597,7 @@ TEST_F(BinaryOperationIntegrationTest, Logical_Or_Vector_Vector_B8_SI16_SI64)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::LOGICAL_OR, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, OR());
@@ -613,8 +613,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_TSS_TSS)
 
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(10);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
 }
@@ -629,7 +629,7 @@ TEST_F(BinaryOperationIntegrationTest, Greater_Scalar_Vector_B8_TSMS_TSS)
 
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER());
@@ -645,8 +645,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_TSS_TSS)
 
   auto lhs = make_random_wrapped_column<TypeLhs>(10);
   auto rhs = make_random_wrapped_column<TypeRhs>(10);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
 }
@@ -667,7 +667,7 @@ TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_TSMS_TSS)
     itr, itr + 100, make_validity_iter());
 
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER());
@@ -683,8 +683,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_STR_STR)
 
   auto lhs = cudf::string_scalar("eee");
   auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
 }
@@ -699,8 +699,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Vector_Scalar_B8_STR_STR)
 
   auto lhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
   auto rhs = cudf::string_scalar("eee");
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
 }
@@ -715,8 +715,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_STR_STR)
 
   auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
   auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
 }
@@ -731,7 +731,7 @@ TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_STR_STR)
 
   auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
   auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER());
@@ -747,7 +747,7 @@ TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Vector_B8_STR_STR)
 
   auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
   auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, EQUAL());
@@ -763,7 +763,7 @@ TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Scalar_B8_STR_STR)
 
   auto rhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
   auto lhs = cudf::string_scalar("");
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, EQUAL());
@@ -779,7 +779,7 @@ TEST_F(BinaryOperationIntegrationTest, LessEqual_Vector_Vector_B8_STR_STR)
 
   auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
   auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::LESS_EQUAL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS_EQUAL());
@@ -795,7 +795,7 @@ TEST_F(BinaryOperationIntegrationTest, GreaterEqual_Vector_Vector_B8_STR_STR)
 
   auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
   auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::GREATER_EQUAL, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER_EQUAL());
@@ -812,7 +812,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32)
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
@@ -829,7 +829,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32_SI16_SI64)
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
@@ -846,7 +846,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Scalar_Vector_SI32)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
@@ -863,7 +863,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Scalar_SI32)
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
@@ -880,7 +880,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32)
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
@@ -897,7 +897,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32_SI16_SI64)
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
@@ -914,7 +914,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Scalar_Vector_SI32)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
@@ -931,7 +931,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Scalar_SI32)
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
@@ -954,7 +954,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32)
   TypeOut expected[] = {2147483644, 39, 536870900, 0, 32768};
   cudf::test::fixed_width_column_wrapper<TypeOut> expected_w(expected, expected + num_els);
 
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs_w, shift_w, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_w);
@@ -972,7 +972,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32_SI1
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
@@ -990,7 +990,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI32)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
@@ -1008,7 +1008,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Scalar_SI32)
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
@@ -1027,7 +1027,7 @@ TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Scalar_SI32_SI32_float)
   fixed_width_column_wrapper<TypeLhs> lhs(elements, elements + 100);
   // Find log to the base 10
   auto rhs = numeric_scalar<TypeRhs>(10);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LOG_BASE());
@@ -1046,7 +1046,7 @@ TEST_F(BinaryOperationIntegrationTest, LogBase_Scalar_Vector_float_SI32)
   fixed_width_column_wrapper<TypeRhs> rhs(elements, elements + 100);
   // Find log to the base 2
   auto lhs = numeric_scalar<TypeLhs>(2);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LOG_BASE());
@@ -1068,7 +1068,7 @@ TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Vector_double_SI64_SI32)
   // Find log to the base 7
   auto rhs_elements = cudf::detail::make_counting_transform_iterator(0, [](auto) { return 7; });
   fixed_width_column_wrapper<TypeRhs> rhs(rhs_elements, rhs_elements + 50);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LOG_BASE());
@@ -1084,7 +1084,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_SI32_SI32
     fixed_width_column_wrapper<TypeLhs>{{999, -37, 0, INT32_MAX}, {true, true, true, false}};
   auto int_scalar = cudf::scalar_type_t<TypeRhs>(999);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1105,7 +1105,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_ScalarInvalid_B8_SI
   auto int_scalar = cudf::scalar_type_t<TypeRhs>(999);
   int_scalar.set_valid_async(false);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1137,7 +1137,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_tsD_tsD)
     {false, true, true, true, false, true, true, false}};
   auto ts_scalar = cudf::scalar_type_t<TypeRhs>(typename TypeRhs::duration{44376}, true);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     ts_scalar, ts_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1158,7 +1158,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_st
   // Empty string
   cudf::string_scalar str_scalar("");
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1178,7 +1178,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st
   // Match a valid string
   cudf::string_scalar str_scalar("<null>");
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1199,7 +1199,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_st
   // Matching a string that isn't present
   cudf::string_scalar str_scalar("foo");
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1221,7 +1221,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st
   cudf::string_scalar str_scalar("foo");
   str_scalar.set_valid_async(false);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1243,7 +1243,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_st
   // Matching a scalar that is valid
   cudf::string_scalar str_scalar("foo");
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1266,7 +1266,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st
   cudf::string_scalar str_scalar("foo");
   str_scalar.set_valid_async(false);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1286,7 +1286,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st
   // Matching an invalid string
   cudf::string_scalar str_scalar("bb");
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1307,7 +1307,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_InvalidScalar_B8_st
   cudf::string_scalar str_scalar("bb");
   str_scalar.set_valid_async(false);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1340,7 +1340,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_tsD_tsD_N
     22270,  // 2030-12-22 00:00:00 GMT
   };
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1364,7 +1364,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st
     cudf::test::strings_column_wrapper({"foo", "valid", "<null>", "", "invalid", "inv", "ééé"},
                                        {true, true, true, true, false, false, true});
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1385,7 +1385,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st
   auto rhs_col =
     cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"});
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1407,7 +1407,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st
     cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
                                        {false, false, false, false, false, false, false});
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1427,7 +1427,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st
   auto rhs_col =
     cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"});
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1448,7 +1448,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st
     cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
                                        {false, false, false, false, false, false, false});
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1470,7 +1470,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st
     cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
                                        {false, false, false, false, false, false, false});
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1491,7 +1491,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_VectorAllInvalid_B8
   auto rhs_col = fixed_width_column_wrapper<TypeLhs>{{-47, 37, 12, 99, 4, -INT32_MAX},
                                                      {false, false, false, false, false, false}};
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1514,7 +1514,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_SI8)
   };
   auto int_scalar = cudf::scalar_type_t<TypeRhs>(77);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_col, int_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1535,7 +1535,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_FP64_SI32_SI64
                                         {false, true, false, true, false, true, false}};
   auto int_scalar = cudf::scalar_type_t<TypeRhs>(INT32_MAX);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1559,7 +1559,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_FP32
   auto float_scalar = cudf::scalar_type_t<TypeRhs>(-3.14f);
   float_scalar.set_valid_async(false);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_col, float_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1581,7 +1581,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_SI8_SI8_FP32)
   auto float_scalar = cudf::scalar_type_t<TypeRhs>(-3.14f);
   float_scalar.set_valid_async(false);
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     float_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1603,7 +1603,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_SI64_SI32_SI8)
   auto another_int_col = fixed_width_column_wrapper<TypeLhs>{
     {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}};
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_col, another_int_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1624,7 +1624,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI64_SI32_SI8)
   auto another_int_col = fixed_width_column_wrapper<TypeLhs>{
     {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}};
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1656,7 +1656,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_tsD_tsD_tsD)
     },
     {false, true, true, true, false}};
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id<cudf::timestamp_D>()));
 
   // Every row has a value
@@ -1678,7 +1678,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI32_SI64_SI8)
   auto another_int_col = fixed_width_column_wrapper<TypeLhs>{
     {9, -37, 0, 32, -47, -4, 55}, {true, false, true, false, true, false, true}};
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
 
   // Every row has a value
@@ -1698,7 +1698,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_string_string_
     {"eee", "goo", "<null>", "", "", "", "ééé", "bar", "foo", "def", "abc"},
     {false, true, true, true, false, true, true, false, false, true, true});
 
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, rhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING});
 
   auto exp_col = cudf::test::strings_column_wrapper(
@@ -1717,7 +1717,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_string_string_
 
   // Returns a non-nullable column as all elements are valid - it will have the scalar
   // value at the very least
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     lhs_col, str_scalar, cudf::binary_operator::NULL_MIN, data_type{type_id::STRING});
 
   auto exp_col = cudf::test::strings_column_wrapper(
@@ -1735,7 +1735,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_string_string_
   str_scalar.set_valid_async(false);
 
   // Returns the lhs_col
-  auto op_col = cudf::binary_operation(
+  auto op_col = cudf::jit::binary_operation(
     str_scalar, lhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING});
 
   auto exp_col = cudf::test::strings_column_wrapper(
@@ -1757,8 +1757,8 @@ TEST_F(BinaryOperationIntegrationTest, CastAdd_Vector_Vector_SI32_float_float)
   auto rhs      = cudf::test::fixed_width_column_wrapper<float>{1.3f, 1.6f};
   auto expected = cudf::test::fixed_width_column_wrapper<int>{2, 3};
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -1773,8 +1773,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_TimepointD_DurationS_Ti
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -1789,8 +1789,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_DurationD_TimepointS_Ti
 
   auto lhs = make_random_wrapped_column<TypeLhs>(100);
   auto rhs = cudf::scalar_type_t<TypeRhs>(typename TypeRhs::duration{34}, true);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -1805,8 +1805,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_DurationS_DurationD_Dur
 
   auto lhs = cudf::scalar_type_t<TypeLhs>(TypeLhs{-9});
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -1823,7 +1823,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI64_SI6
   auto lhs = cudf::scalar_type_t<TypeLhs>(-12);
   // this generates values in the range 1-10 which should be reasonable for the shift
   auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
@@ -1838,8 +1838,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Scalar_Vector_FP32)
   auto lhs = cudf::scalar_type_t<TypeLhs>(-86099.68377);
   auto rhs = fixed_width_column_wrapper<TypeRhs>{{90770.74881, -15456.4335, 32213.22119}};
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
 
   auto expected_result =
     fixed_width_column_wrapper<TypeOut>{{4671.0625, -8817.51953125, 10539.974609375}};
@@ -1855,8 +1855,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Scalar_FP64)
   auto lhs = fixed_width_column_wrapper<TypeLhs>{{90770.74881, -15456.4335, 32213.22119}};
   auto rhs = cudf::scalar_type_t<TypeRhs>(-86099.68377);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
 
   auto expected_result = fixed_width_column_wrapper<TypeOut>{
     {4671.0650400000013178, -15456.433499999999185, 32213.221190000000206}};
@@ -1880,8 +1880,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_FP32_FP64)
                                                   2.1336193413893147E307,
                                                   -2.1336193413893147E307}};
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
 
   auto expected_result = fixed_width_column_wrapper<TypeOut>{{24854.55859375,
                                                               2664.7075000000040745,
@@ -1905,8 +1905,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_SI32_SI64)
   auto lhs = make_random_wrapped_column<TypeLhs>(1000);
   auto rhs = make_random_wrapped_column<TypeRhs>(1000);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, PMOD());
 }
@@ -1922,8 +1922,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_SI32_SI64)
   auto lhs = make_random_wrapped_column<TypeLhs>(1000);
   auto rhs = make_random_wrapped_column<TypeRhs>(1000);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, PMOD());
 }
@@ -1939,8 +1939,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_FP64_FP64)
   auto lhs = make_random_wrapped_column<TypeLhs>(1000);
   auto rhs = make_random_wrapped_column<TypeRhs>(1000);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, PMOD());
 }
@@ -1956,7 +1956,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Scalar_Vector_FP32)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(10000);
 
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
 
   // atan2 has a max ULP error of 2 per CUDA programming guide
@@ -1974,7 +1974,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Scalar_FP64)
   auto lhs = make_random_wrapped_column<TypeLhs>(10000);
   auto rhs = make_random_wrapped_scalar<TypeRhs>();
 
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
 
   // atan2 has a max ULP error of 2 per CUDA programming guide
@@ -1992,7 +1992,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_FP32_FP64)
   auto lhs = make_random_wrapped_column<TypeLhs>(10000);
   auto rhs = make_random_wrapped_column<TypeRhs>(10000);
 
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
 
   // atan2 has a max ULP error of 2 per CUDA programming guide
@@ -2010,7 +2010,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_SI32_SI64)
   auto lhs = make_random_wrapped_column<TypeLhs>(10000);
   auto rhs = make_random_wrapped_column<TypeRhs>(10000);
 
-  auto out = cudf::binary_operation(
+  auto out = cudf::jit::binary_operation(
     lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
 
   // atan2 has a max ULP error of 2 per CUDA programming guide
@@ -2053,7 +2053,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -2086,7 +2086,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -2108,7 +2108,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2127,7 +2127,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2146,7 +2146,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2163,7 +2163,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv3)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2183,7 +2183,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv4)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2202,7 +2202,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd2)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2221,7 +2221,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd3)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2238,7 +2238,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd4)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::ADD, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2255,7 +2255,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd5)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::ADD, lhs->type(), static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2272,8 +2272,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd6)
   auto const expected2 = fp_wrapper<RepType>{{0, 0, 1, 1, 1, 1}, scale_type{1}};
   auto const type1     = cudf::data_type{cudf::type_to_id<decimalXX>(), 0};
   auto const type2     = cudf::data_type{cudf::type_to_id<decimalXX>(), 1};
-  auto const result1   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1);
-  auto const result2   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2);
+  auto const result1   = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type1);
+  auto const result2   = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type2);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -2305,7 +2305,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar)
 
   auto const type = cudf::binary_operation_fixed_point_output_type(
     cudf::binary_operator::MUL, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2324,7 +2324,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(lhs).type(),
                                                    static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2340,8 +2340,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple)
   auto const col2     = fp_wrapper<RepType>{{100, 200, 300, 400}, scale_type{-2}};
   auto const expected = wrapper<bool>(trues.begin(), trues.end());
 
-  auto const result =
-    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result = cudf::jit::binary_operation(
+    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2357,7 +2357,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0)
   auto const expected = wrapper<bool>(trues.begin(), trues.end());
 
   auto const result =
-    cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+    cudf::jit::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2372,8 +2372,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null)
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
   auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
 
-  auto const result =
-    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result = cudf::jit::binary_operation(
+    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2388,8 +2388,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null)
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
   auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
 
-  auto const result =
-    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+  auto const result = cudf::jit::binary_operation(
+    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2415,7 +2415,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
                                                    static_cast<cudf::column_view>(iota_3).type(),
                                                    static_cast<cudf::column_view>(zeros_3).type());
-  auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
+  auto const iota_3_after_add =
+    cudf::jit::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view());
 
@@ -2426,15 +2427,15 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater)
 
   auto const btype = cudf::data_type{type_id::BOOL8};
   auto const equal_result =
-    cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
+    cudf::jit::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view());
 
   auto const less_result =
-    cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
+    cudf::jit::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view());
 
   auto const greater_result =
-    cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
+    cudf::jit::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
 }
 
@@ -2453,7 +2454,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMaxSimple)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX,
                                                    static_cast<cudf::column_view>(col1).type(),
                                                    static_cast<cudf::column_view>(col2).type());
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
+  auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2473,7 +2474,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMinSimple)
     cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN,
                                                    static_cast<cudf::column_view>(col1).type(),
                                                    static_cast<cudf::column_view>(col2).type());
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
+  auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2489,7 +2490,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullEqualsSimple)
   auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
   auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
 
-  auto const result = cudf::binary_operation(
+  auto const result = cudf::jit::binary_operation(
     col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
@@ -2506,7 +2507,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div)
   auto const expected = fp_wrapper<RepType>{{25, 75, 125, 175}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2522,7 +2523,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div2)
   auto const expected = fp_wrapper<RepType>{{5000, 15000, 25000, 35000}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2538,7 +2539,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div3)
   auto const expected = fp_wrapper<RepType>{{3333, 3333, 16666, 23333}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2554,7 +2555,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div4)
   auto const expected = fp_wrapper<RepType>{{3, 10, 16, 23}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2571,7 +2572,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div6)
   auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2588,7 +2589,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div7)
   auto const expected = fp_wrapper<RepType>{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2604,7 +2605,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div8)
   auto const expected = fp_wrapper<RepType>{{0, 1, 16}, scale_type{2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2620,7 +2621,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div9)
   auto const expected = fp_wrapper<RepType>{{1, 2, 4}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2636,7 +2637,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div10)
   auto const expected = fp_wrapper<RepType>{{14, 28, 42}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2652,7 +2653,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div11)
   auto const expected = fp_wrapper<RepType>{{142, 285, 428}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+  auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2666,9 +2667,9 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpThrows)
   auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
   auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
   auto const float_type    = data_type{type_id::FLOAT32};
-  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
+  EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
                cudf::logic_error);
-  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::MUL, float_type),
+  EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::MUL, float_type),
                cudf::logic_error);
 }
 
diff --git a/cpp/tests/binaryop/binop-null-test.cpp b/cpp/tests/binaryop/binop-null-test.cpp
index c91bc12d95f..25ec3b30834 100644
--- a/cpp/tests/binaryop/binop-null-test.cpp
+++ b/cpp/tests/binaryop/binop-null-test.cpp
@@ -66,8 +66,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_Valid)
   lhs.set_valid_async(false);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::ALL_VALID);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -83,8 +83,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Valid_Vector_NonNullable)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -101,8 +101,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_NonNullable)
   lhs.set_valid_async(false);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -118,8 +118,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Scalar_Valid)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::ALL_NULL);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -135,8 +135,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_Valid)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::ALL_NULL);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::ALL_VALID);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -152,8 +152,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_NonNullable)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::ALL_NULL);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -169,8 +169,8 @@ TEST_F(BinaryOperationNullTest, Vector_Valid_Vector_NonNullable)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::ALL_VALID);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -186,8 +186,8 @@ TEST_F(BinaryOperationNullTest, Vector_NonNullable_Vector_NonNullable)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::UNALLOCATED);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out =
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out = cudf::jit::binary_operation(
+    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp
index 167fbc22bde..779dc7c4c1f 100644
--- a/cpp/tests/binaryop/binop-verify-input-test.cpp
+++ b/cpp/tests/binaryop/binop-verify-input-test.cpp
@@ -35,9 +35,9 @@ TEST_F(BinopVerifyInputTest, Vector_Scalar_ErrorOutputVectorType)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(10);
 
-  EXPECT_THROW(
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)),
-    cudf::logic_error);
+  EXPECT_THROW(cudf::jit::binary_operation(
+                 lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)),
+               cudf::logic_error);
 }
 
 TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize)
@@ -49,9 +49,9 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize)
   auto lhs = make_random_wrapped_column<TypeLhs>(1);
   auto rhs = make_random_wrapped_column<TypeRhs>(10);
 
-  EXPECT_THROW(
-    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>())),
-    cudf::logic_error);
+  EXPECT_THROW(cudf::jit::binary_operation(
+                 lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>())),
+               cudf::logic_error);
 }
 
 }  // namespace binop
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index 47b2a95e7b5..ced809c243d 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -524,8 +524,8 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
   auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};
 
   auto const type    = cudf::data_type{cudf::type_id::DECIMAL32, 6};
-  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
-  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);
+  auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type);
+  auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -547,8 +547,8 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
   auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};
 
   auto const type    = cudf::data_type{cudf::type_id::DECIMAL64, 100};
-  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
-  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);
+  auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type);
+  auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -570,10 +570,10 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
   auto const expected2 = fp_wrapper{{5}, scale_type{-201}};
 
   auto const type1   = cudf::data_type{cudf::type_to_id<decimalXX>(), -202};
-  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1);
+  auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type1);
 
   auto const type2   = cudf::data_type{cudf::type_to_id<decimalXX>(), -201};
-  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2);
+  auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type2);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/cpp/binaryop.pxd
index c3320b371cd..c36ab124bf8 100644
--- a/python/cudf/cudf/_lib/cpp/binaryop.pxd
+++ b/python/cudf/cudf/_lib/cpp/binaryop.pxd
@@ -61,3 +61,27 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
         const string& op,
         data_type output_type
     ) except +
+
+    unique_ptr[column] jit_binary_operation \
+        "cudf::jit::binary_operation" (
+        const column_view& lhs,
+        const column_view& rhs,
+        binary_operator op,
+        data_type output_type
+    ) except +
+
+    unique_ptr[column] jit_binary_operation \
+        "cudf::jit::binary_operation" (
+        const column_view& lhs,
+        const scalar& rhs,
+        binary_operator op,
+        data_type output_type
+    ) except +
+
+    unique_ptr[column] jit_binary_operation \
+        "cudf::jit::binary_operation" (
+        const scalar& lhs,
+        const column_view& rhs,
+        binary_operator op,
+        data_type output_type
+    ) except +
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 85a9f85ad22..bc12b42a3fa 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -164,7 +164,17 @@ def binary_operator(
                 ):
                     out_dtype = cudf.dtype("float64")
 
-        if binop in {"lt", "gt", "le", "ge", "eq", "ne", "NULL_EQUALS"}:
+        if binop in {
+            "l_and",
+            "l_or",
+            "lt",
+            "gt",
+            "le",
+            "ge",
+            "eq",
+            "ne",
+            "NULL_EQUALS",
+        }:
             out_dtype = "bool"
         lhs, rhs = (self, rhs) if not reflect else (rhs, self)
         return libcudf.binaryop.binaryop(lhs, rhs, binop, out_dtype)

From 406e87bdb8b1976bc8d47794b79901fc35d4803d Mon Sep 17 00:00:00 2001
From: Marlene <57748216+marlenezw@users.noreply.github.com>
Date: Mon, 23 Aug 2021 17:40:17 +0200
Subject: [PATCH 26/46] Allowing %f in format to return nanoseconds (#9081)

This is a quick fix to close PR #7945.
This PR checks whether `%f` is passed as part of `format` to `cudf.to_datetime`. Previously, cudf would not return nanoseconds in that case, whereas pandas does.

Authors:
  - Marlene  (https://github.com/marlenezw)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9081
---
 python/cudf/cudf/core/tools/datetimes.py | 3 +++
 python/cudf/cudf/tests/test_datetime.py  | 1 +
 2 files changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 946cdcb1ebc..4856995b391 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -123,6 +123,9 @@ def to_datetime(
     if yearfirst:
         raise NotImplementedError("yearfirst support is not yet implemented")
 
+    if format is not None and "%f" in format:
+        format = format.replace("%f", "%9f")
+
     try:
         if isinstance(arg, cudf.DataFrame):
             # we require at least Ymd
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 9f19bf8b960..65e87e88f55 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -717,6 +717,7 @@ def test_to_datetime_units(data, unit):
         (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%m/%d/%Y"),
         (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%d/%m/%Y"),
         (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], None),
+        (["2021-04-13 12:30:04.123456789"], "%Y-%m-%d %H:%M:%S.%f"),
         (pd.Series([2015, 2020, 2021]), "%Y"),
         pytest.param(
             pd.Series(["1", "2", "1"]),

From 8aefeb49bec96e3bc27a05276e66471f8ca7f966 Mon Sep 17 00:00:00 2001
From: MithunR <mythrocks@gmail.com>
Date: Mon, 23 Aug 2021 11:21:07 -0700
Subject: [PATCH 27/46] Fix branch_stack calculation in `row_bit_count()`
 (#9076)

Fixes #8938.

For input whose number of rows exceeds `max_block_size`, `row_bit_count()` currently
reaches past the bounds of its shared-memory allocation, causing illegal memory access
errors like the one in [cudf/issues/8938](https://github.com/rapidsai/cudf/issues/8938).

This commit corrects the calculation of the branch stack's base address, and adds a
test for this case.

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/9076
---
 cpp/src/transform/row_bit_count.cu        |  2 +-
 cpp/tests/transform/row_bit_count_test.cu | 65 +++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 620504f5c93..27936ce04b3 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -408,7 +408,7 @@ __global__ void compute_row_sizes(device_span<column_device_view const> cols,
   if (tid >= num_rows) { return; }
 
   // branch stack. points to the last list prior to branching.
-  row_span* my_branch_stack = thread_branch_stacks + (tid * max_branch_depth);
+  row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth);
   size_type branch_depth{0};
 
   // current row span - always starts at 1 row.
diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu
index 0081cf0d467..8284def5f13 100644
--- a/cpp/tests/transform/row_bit_count_test.cu
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -15,7 +15,9 @@
  */
 
 #include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/io/parquet.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 #include <cudf_test/base_fixture.hpp>
@@ -25,6 +27,9 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/fill.h>
+#include <thrust/tabulate.h>
+
 using namespace cudf;
 
 template <typename T>
@@ -192,6 +197,66 @@ TEST_F(RowBitCount, StringsWithNulls)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
 }
 
+namespace {
+
+/**
+ * @brief __device__ functor to multiply input by 2, defined out of line because __device__ lambdas
+ * cannot be defined in a TEST_F().
+ */
+struct times_2 {
+  int32_t __device__ operator()(int32_t i) const { return i * 2; }
+};
+
+}  // namespace
+
+TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock)
+{
+  // Tests that `row_bit_count()` can handle struct<list<int32_t>> with more
+  // than max_block_size (256) rows.
+  // With a large number of rows, computation spills to multiple thread-blocks,
+  // thus exercising the branch-stack computation.
+  // The contents of the input column aren't as pertinent to this test as the
+  // column size. For what it's worth, it looks as follows:
+  //   [ struct({0,1}), struct({2,3}), struct({4,5}), ... ]
+
+  using namespace cudf;
+  auto constexpr num_rows = 1024 * 2;  // Exceeding a block size.
+
+  // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows - 1};
+  auto ints      = make_numeric_column(data_type{type_id::INT32}, num_rows * 2);
+  auto ints_view = ints->mutable_view();
+  thrust::tabulate(thrust::device,
+                   ints_view.begin<int32_t>(),
+                   ints_view.end<int32_t>(),
+                   thrust::identity<int32_t>());
+
+  // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2};
+  auto list_offsets      = make_numeric_column(data_type{type_id::INT32}, num_rows + 1);
+  auto list_offsets_view = list_offsets->mutable_view();
+  thrust::tabulate(thrust::device,
+                   list_offsets_view.begin<offset_type>(),
+                   list_offsets_view.end<offset_type>(),
+                   times_2{});
+
+  // List<int32_t> = {{0,1}, {2,3}, {4,5}, ..., {2*(num_rows-1), 2*num_rows-1}};
+  auto lists_column = make_lists_column(num_rows, std::move(list_offsets), std::move(ints), 0, {});
+
+  // Struct<List<int32_t>>.
+  auto struct_members = std::vector<std::unique_ptr<column>>{};
+  struct_members.emplace_back(std::move(lists_column));
+  auto structs_column = make_structs_column(num_rows, std::move(struct_members), 0, {});
+
+  // Compute row_bit_count, and compare.
+  auto row_bit_counts          = row_bit_count(table_view{{structs_column->view()}});
+  auto expected_row_bit_counts = make_numeric_column(data_type{type_id::INT32}, num_rows);
+  thrust::fill_n(thrust::device,
+                 expected_row_bit_counts->mutable_view().begin<int32_t>(),
+                 num_rows,
+                 CHAR_BIT * (2 * sizeof(int32_t) + sizeof(offset_type)));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(row_bit_counts->view(), expected_row_bit_counts->view());
+}
+
 std::pair<std::unique_ptr<column>, std::unique_ptr<column>> build_struct_column()
 {
   std::vector<bool> struct_validity{0, 1, 1, 1, 1, 0};

From d7a05dc88950039408152c0f8a75fc4c83a9f95c Mon Sep 17 00:00:00 2001
From: Alfred Xu <lovedreamf@gmail.com>
Date: Tue, 24 Aug 2021 09:43:52 +0800
Subject: [PATCH 28/46] Support nested types for nth_element reduction (#9043)

Closes #8967

This PR adds support for constructing default scalars of nested types (LIST_TYPE and STRUCT_TYPE) in reductions, so that the nth_element reduction can handle nested types.

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/9043
---
 cpp/include/cudf/scalar/scalar_factories.hpp |  14 +
 cpp/src/reductions/reductions.cpp            |  19 +-
 cpp/src/scalar/scalar_factories.cpp          |  21 ++
 cpp/tests/groupby/nth_element_tests.cpp      | 107 +++++++-
 cpp/tests/reductions/reduction_tests.cpp     | 264 +++++++++++++++++++
 5 files changed, 416 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp
index b96a8c65a04..b949f8d542f 100644
--- a/cpp/include/cudf/scalar/scalar_factories.hpp
+++ b/cpp/include/cudf/scalar/scalar_factories.hpp
@@ -121,6 +121,20 @@ std::unique_ptr<scalar> make_default_constructed_scalar(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Creates an empty (invalid) scalar of the same type as the `input` column_view.
+ *
+ * @throw cudf::logic_error if the `input` column is struct type and empty
+ *
+ * @param input Immutable view of input column to emulate
+ * @param stream CUDA stream used for device memory operations.
+ * @param mr Device memory resource used to allocate the scalar's `data` and `is_valid` bool.
+ */
+std::unique_ptr<scalar> make_empty_scalar_like(
+  column_view const& input,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Construct scalar using the given value of fixed width type
  *
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index a8117373ca4..699494c49c5 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -25,6 +25,7 @@
 #include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 
+#include <cudf/structs/structs_column_view.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf {
@@ -112,15 +113,17 @@ std::unique_ptr<scalar> reduce(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  std::unique_ptr<scalar> result = make_default_constructed_scalar(output_dtype, stream, mr);
-  result->set_valid_async(false, stream);
-
-  // check if input column is empty
-  if (col.size() <= col.null_count()) return result;
+  // Return a default scalar if the input column has no valid rows. For nested columns, the
+  // default scalar has to be handcrafted from the input column.
+  if (col.size() <= col.null_count()) {
+    if (col.type().id() == type_id::EMPTY || col.type() != output_dtype) {
+      return make_default_constructed_scalar(output_dtype, stream, mr);
+    }
+    return make_empty_scalar_like(col, stream, mr);
+  }
 
-  result =
-    aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg);
-  return result;
+  return aggregation_dispatcher(
+    agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg);
 }
 }  // namespace detail
 
diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp
index af78d84d874..25418cf0f7e 100644
--- a/cpp/src/scalar/scalar_factories.cpp
+++ b/cpp/src/scalar/scalar_factories.cpp
@@ -20,6 +20,7 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <cudf/detail/copy.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf {
@@ -165,4 +166,24 @@ std::unique_ptr<scalar> make_default_constructed_scalar(data_type type,
   return type_dispatcher(type, default_scalar_functor{}, stream, mr);
 }
 
+std::unique_ptr<scalar> make_empty_scalar_like(column_view const& column,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  std::unique_ptr<scalar> result;
+  switch (column.type().id()) {
+    case type_id::LIST:
+      result = make_list_scalar(empty_like(column)->view(), stream, mr);
+      result->set_valid_async(false, stream);
+      break;
+    case type_id::STRUCT:
+      // The input column must have at least 1 row to extract a scalar (row) from it.
+      result = detail::get_element(column, 0, stream, mr);
+      result->set_valid_async(false, stream);
+      break;
+    default: result = make_default_constructed_scalar(column.type(), stream, mr);
+  }
+  return result;
+}
+
 }  // namespace cudf
diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp
index 22f1e14815f..47dfa2426eb 100644
--- a/cpp/tests/groupby/nth_element_tests.cpp
+++ b/cpp/tests/groupby/nth_element_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -414,5 +414,110 @@ TYPED_TEST(groupby_nth_element_lists_test, EmptyInput)
                   cudf::make_nth_element_aggregation<groupby_aggregation>(2));
 }
 
+struct groupby_nth_element_structs_test : BaseFixture {
+};
+
+TEST_F(groupby_nth_element_structs_test, Basics)
+{
+  using structs = cudf::test::structs_column_wrapper;
+  using ints    = cudf::test::fixed_width_column_wrapper<int>;
+  using doubles = cudf::test::fixed_width_column_wrapper<double>;
+  using strings = cudf::test::strings_column_wrapper;
+
+  auto keys   = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3};
+  auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  auto child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
+  auto child2 = strings{"", "a", "b", "c", "d", "e", "f", "g", "HH", "JJJ"};
+  auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};
+
+  auto expected_keys   = ints{0, 1, 2, 3};
+  auto expected_ch0    = ints{1, 4, 7, 0};
+  auto expected_ch1    = doubles{1.2, 4.51, -0.07, 0.0};
+  auto expected_ch2    = strings{"a", "d", "g", ""};
+  auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(1));
+
+  expected_keys   = ints{0, 1, 2, 3};
+  expected_ch0    = ints{0, 4, 6, 9};
+  expected_ch1    = doubles{0.1, 4.51, 6.3231, 9.999};
+  expected_ch2    = strings{"", "d", "f", "JJJ"};
+  expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(0, null_policy::EXCLUDE));
+}
+
+TEST_F(groupby_nth_element_structs_test, NestedStructs)
+{
+  using structs = cudf::test::structs_column_wrapper;
+  using ints    = cudf::test::fixed_width_column_wrapper<int>;
+  using doubles = cudf::test::fixed_width_column_wrapper<double>;
+  using lists   = cudf::test::lists_column_wrapper<int>;
+
+  auto keys             = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3};
+  auto child0           = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  auto child0_of_child1 = ints{0, -1, -2, -3, -4, -5, -6, -7, -8, -9};
+  auto child1_of_child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
+  auto child1           = structs{child0_of_child1, child1_of_child1};
+  auto child2           = lists{{0}, {1, 2, 3}, {}, {4}, {5, 6}, {}, {}, {7}, {8, 9}, {}};
+  auto values           = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};
+
+  auto expected_keys       = ints{0, 1, 2, 3};
+  auto expected_ch0        = ints{1, 4, 7, 0};
+  auto expected_ch0_of_ch1 = ints{-1, -4, -7, 0};
+  auto expected_ch1_of_ch1 = doubles{1.2, 4.51, -0.07, 0.0};
+  auto expected_ch1        = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
+  auto expected_ch2        = lists{{1, 2, 3}, {5, 6}, {7}, {}};
+  auto expected_values     = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(1));
+
+  expected_keys       = ints{0, 1, 2, 3};
+  expected_ch0        = ints{0, 4, 6, 9};
+  expected_ch0_of_ch1 = ints{0, -4, -6, -9};
+  expected_ch1_of_ch1 = doubles{0.1, 4.51, 6.3231, 9.999};
+  expected_ch1        = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
+  expected_ch2        = lists{{0}, {5, 6}, {}, {}};
+  expected_values     = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(0, null_policy::EXCLUDE));
+}
+
+TEST_F(groupby_nth_element_structs_test, EmptyInput)
+{
+  using structs = cudf::test::structs_column_wrapper;
+  using ints    = cudf::test::fixed_width_column_wrapper<int>;
+  using doubles = cudf::test::fixed_width_column_wrapper<double>;
+  using strings = cudf::test::strings_column_wrapper;
+
+  auto keys   = ints{};
+  auto child0 = ints{};
+  auto child1 = doubles{};
+  auto child2 = strings{};
+  auto values = structs{{child0, child1, child2}};
+
+  auto expected_keys   = ints{};
+  auto expected_ch0    = ints{};
+  auto expected_ch1    = doubles{};
+  auto expected_ch2    = strings{};
+  auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}};
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  cudf::make_nth_element_aggregation<groupby_aggregation>(0));
+}
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index da9032737f2..88318a41882 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -24,8 +24,10 @@
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar.hpp>
 #include <cudf/types.hpp>
 #include <cudf/wrappers/timestamps.hpp>
+#include <cudf_test/table_utilities.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -1872,4 +1874,266 @@ TYPED_TEST(DictionaryReductionTest, Quantile)
                        output_type);
 }
 
+struct ListReductionTest : public cudf::test::BaseFixture {
+  void reduction_test(cudf::column_view const& input_data,
+                      cudf::column_view const& expected_value,
+                      bool succeeded_condition,
+                      bool is_valid,
+                      std::unique_ptr<aggregation> const& agg)
+  {
+    auto statement = [&]() {
+      std::unique_ptr<cudf::scalar> result =
+        cudf::reduce(input_data, agg, cudf::data_type(cudf::type_id::LIST));
+      auto list_result = dynamic_cast<cudf::list_scalar*>(result.get());
+      EXPECT_EQ(is_valid, list_result->is_valid());
+      if (is_valid) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); }
+    };
+
+    if (succeeded_condition) {
+      CUDF_EXPECT_NO_THROW(statement());
+    } else {
+      EXPECT_ANY_THROW(statement());
+    }
+  }
+};
+
+TEST_F(ListReductionTest, ListReductionNthElement)
+{
+  using LCW        = cudf::test::lists_column_wrapper<int>;
+  using ElementCol = cudf::test::fixed_width_column_wrapper<int>;
+
+  // test without nulls
+  LCW col{{-3}, {2, 1}, {0, 5, -3}, {-2}, {}, {28}};
+  this->reduction_test(col,
+                       ElementCol{0, 5, -3},  // expected_value,
+                       true,
+                       true,
+                       cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE));
+
+  // test with null-exclude
+  std::vector<bool> validity{1, 0, 0, 1, 1, 0};
+  LCW col_nulls({{-3}, {2, 1}, {0, 5, -3}, {-2}, {}, {28}}, validity.begin());
+  this->reduction_test(col_nulls,
+                       ElementCol{-2},  // expected_value,
+                       true,
+                       true,
+                       cudf::make_nth_element_aggregation(1, cudf::null_policy::EXCLUDE));
+
+  // test with null-include
+  this->reduction_test(col_nulls,
+                       ElementCol{},  // expected_value,
+                       true,
+                       false,
+                       cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE));
+}
+
+TEST_F(ListReductionTest, NestedListReductionNthElement)
+{
+  using LCW = cudf::test::lists_column_wrapper<int>;
+
+  // test selecting a valid row (index 0) from a column that contains null rows
+  auto validity    = std::vector<bool>{1, 0, 0, 1, 1};
+  auto nested_list = LCW(
+    {{LCW{}, LCW{2, 3, 4}}, {}, {LCW{5}, LCW{6}, LCW{7, 8}}, {LCW{9, 10}}, {LCW{11}, LCW{12, 13}}},
+    validity.begin());
+  this->reduction_test(nested_list,
+                       LCW{{}, {2, 3, 4}},  // expected_value,
+                       true,
+                       true,
+                       cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE));
+
+  // test with null-include
+  this->reduction_test(nested_list,
+                       LCW{},  // expected_value,
+                       true,
+                       false,
+                       cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE));
+
+  // test with null-exclude
+  this->reduction_test(nested_list,
+                       LCW{{11}, {12, 13}},  // expected_value,
+                       true,
+                       true,
+                       cudf::make_nth_element_aggregation(2, cudf::null_policy::EXCLUDE));
+}
+
+TEST_F(ListReductionTest, NonValidListReductionNthElement)
+{
+  using LCW        = cudf::test::lists_column_wrapper<int>;
+  using ElementCol = cudf::test::fixed_width_column_wrapper<int>;
+
+  // test against col.size() <= col.null_count()
+  std::vector<bool> validity{0};
+  this->reduction_test(LCW{{{1, 2}}, validity.begin()},
+                       ElementCol{},  // expected_value,
+                       true,
+                       false,
+                       cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE));
+
+  // test against empty input
+  this->reduction_test(LCW{},
+                       ElementCol{{0}, {0}},  // expected_value,
+                       true,
+                       false,
+                       cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE));
+}
+
+struct StructReductionTest : public cudf::test::BaseFixture {
+  using SCW = cudf::test::structs_column_wrapper;
+
+  void reduction_test(SCW const& struct_column,
+                      cudf::table_view const& expected_value,
+                      bool succeeded_condition,
+                      bool is_valid,
+                      std::unique_ptr<aggregation> const& agg)
+  {
+    auto statement = [&]() {
+      std::unique_ptr<cudf::scalar> result =
+        cudf::reduce(struct_column, agg, cudf::data_type(cudf::type_id::STRUCT));
+      auto struct_result = dynamic_cast<cudf::struct_scalar*>(result.get());
+      EXPECT_EQ(is_valid, struct_result->is_valid());
+      if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUAL(expected_value, struct_result->view()); }
+    };
+
+    if (succeeded_condition) {
+      CUDF_EXPECT_NO_THROW(statement());
+    } else {
+      EXPECT_ANY_THROW(statement());
+    }
+  }
+};
+
+TEST_F(StructReductionTest, StructReductionNthElement)
+{
+  using ICW = cudf::test::fixed_width_column_wrapper<int>;
+
+  // test without struct-level nulls (child2 is still null at the selected row)
+  auto child0 = *ICW{-3, 2, 1, 0, 5, -3, -2, 28}.release();
+  auto child1 = *ICW{0, 1, 2, 3, 4, 5, 6, 7}.release();
+  auto child2 =
+    *ICW{{-10, 10, -100, 100, -1000, 1000, -10000, 10000}, {1, 0, 0, 1, 1, 1, 0, 1}}.release();
+  std::vector<std::unique_ptr<cudf::column>> input_vector;
+  input_vector.push_back(std::make_unique<cudf::column>(child0));
+  input_vector.push_back(std::make_unique<cudf::column>(child1));
+  input_vector.push_back(std::make_unique<cudf::column>(child2));
+  auto struct_col  = SCW(std::move(input_vector));
+  auto result_col0 = ICW{1};
+  auto result_col1 = ICW{2};
+  auto result_col2 = ICW{{0}, {0}};
+  this->reduction_test(
+    struct_col,
+    cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
+    true,
+    true,
+    cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE));
+
+  // test with null-include
+  std::vector<bool> validity{1, 1, 1, 0, 1, 0, 0, 1};
+  input_vector.clear();
+  input_vector.push_back(std::make_unique<cudf::column>(child0));
+  input_vector.push_back(std::make_unique<cudf::column>(child1));
+  input_vector.push_back(std::make_unique<cudf::column>(child2));
+  struct_col  = SCW(std::move(input_vector), validity);
+  result_col0 = ICW{{0}, {0}};
+  result_col1 = ICW{{0}, {0}};
+  result_col2 = ICW{{0}, {0}};
+  this->reduction_test(
+    struct_col,
+    cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
+    true,
+    false,
+    cudf::make_nth_element_aggregation(6, cudf::null_policy::INCLUDE));
+
+  // test with null-exclude
+  result_col0 = ICW{{28}, {1}};
+  result_col1 = ICW{{7}, {1}};
+  result_col2 = ICW{{10000}, {1}};
+  this->reduction_test(
+    struct_col,
+    cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
+    true,
+    true,
+    cudf::make_nth_element_aggregation(4, cudf::null_policy::EXCLUDE));
+}
+
+TEST_F(StructReductionTest, NestedStructReductionNthElement)
+{
+  using ICW = cudf::test::fixed_width_column_wrapper<int>;
+  using LCW = cudf::test::lists_column_wrapper<int>;
+
+  auto int_col0      = ICW{-4, -3, -2, -1, 0};
+  auto struct_col0   = SCW({int_col0}, std::vector<bool>{1, 0, 0, 1, 1});
+  auto int_col1      = ICW{0, 1, 2, 3, 4};
+  auto list_col      = LCW{{0}, {}, {1, 2}, {3}, {4}};
+  auto struct_col1   = SCW({struct_col0, int_col1, list_col}, std::vector<bool>{1, 1, 1, 0, 1});
+  auto result_child0 = ICW{0};
+  auto result_col0   = SCW({result_child0}, std::vector<bool>{0});
+  auto result_col1   = ICW{{1}, {1}};
+  auto result_col2   = LCW({LCW{}}, std::vector<bool>{1}.begin());
+  // test a valid top-level row (index 1) whose nested struct child is null
+  this->reduction_test(
+    struct_col1,
+    cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
+    true,
+    true,
+    cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE));
+
+  // test with null-include
+  result_child0 = ICW{0};
+  result_col0   = SCW({result_child0}, std::vector<bool>{0});
+  result_col1   = ICW{{0}, {0}};
+  result_col2   = LCW({LCW{3}}, std::vector<bool>{0}.begin());
+  this->reduction_test(
+    struct_col1,
+    cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
+    true,
+    false,
+    cudf::make_nth_element_aggregation(3, cudf::null_policy::INCLUDE));
+
+  // test with null-exclude
+  result_child0 = ICW{0};
+  result_col0   = SCW({result_child0}, std::vector<bool>{1});
+  result_col1   = ICW{{4}, {1}};
+  result_col2   = LCW({LCW{4}}, std::vector<bool>{1}.begin());
+  this->reduction_test(
+    struct_col1,
+    cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
+    true,
+    true,
+    cudf::make_nth_element_aggregation(3, cudf::null_policy::EXCLUDE));
+}
+
+TEST_F(StructReductionTest, NonValidStructReductionNthElement)
+{
+  using ICW = cudf::test::fixed_width_column_wrapper<int>;
+
+  // test against col.size() <= col.null_count()
+  auto child0     = ICW{-3, 3};
+  auto child1     = ICW{0, 0};
+  auto child2     = ICW{{-10, 10}, {0, 1}};
+  auto struct_col = SCW{{child0, child1, child2}, {0, 0}};
+  auto ret_col0   = ICW{{0}, {0}};
+  auto ret_col1   = ICW{{0}, {0}};
+  auto ret_col2   = ICW{{0}, {0}};
+  this->reduction_test(struct_col,
+                       cudf::table_view{{ret_col0, ret_col1, ret_col2}},  // expected_value,
+                       true,
+                       false,
+                       cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE));
+
+  // test against empty input (expected to throw: an empty struct scalar cannot be created)
+  child0     = ICW{};
+  child1     = ICW{};
+  child2     = ICW{};
+  struct_col = SCW{{child0, child1, child2}};
+  ret_col0   = ICW{};
+  ret_col1   = ICW{};
+  ret_col2   = ICW{};
+  this->reduction_test(struct_col,
+                       cudf::table_view{{ret_col0, ret_col1, ret_col2}},  // expected_value,
+                       false,
+                       false,
+                       cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE));
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 5fce0841b88059b5df3d76431cf48da881859d08 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 24 Aug 2021 17:25:47 +0200
Subject: [PATCH 29/46] Update to UCX-Py 0.22 (#9099)

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Jordan Jacobelli (https://github.com/Ethyling)
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/9099
---
 ci/gpu/build.sh | 2 +-
 ci/gpu/java.sh  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 326fc2f1119..8e5b4d80115 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -80,7 +80,7 @@ gpuci_mamba_retry install -y \
                   "rapids-notebook-env=$MINOR_VERSION.*" \
                   "dask-cuda=${MINOR_VERSION}" \
                   "rmm=$MINOR_VERSION.*" \
-                  "ucx-py=0.21.*"
+                  "ucx-py=0.22.*"
 
 # https://docs.rapids.ai/maintainers/depmgmt/
 # gpuci_mamba_retry remove --force rapids-build-env rapids-notebook-env
diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh
index 8c4b597d12d..b46817bb9ab 100755
--- a/ci/gpu/java.sh
+++ b/ci/gpu/java.sh
@@ -80,7 +80,7 @@ gpuci_conda_retry install -y \
                   "rapids-notebook-env=$MINOR_VERSION.*" \
                   "dask-cuda=${MINOR_VERSION}" \
                   "rmm=$MINOR_VERSION.*" \
-                  "ucx-py=0.21.*" \
+                  "ucx-py=0.22.*" \
                   "openjdk=8.*" \
                   "maven"
 

From c271ce2379d20712e097670a49992a9175747907 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Tue, 24 Aug 2021 11:46:16 -0500
Subject: [PATCH 30/46] move filepath and mmap logic out of json/csv up to
 functions.cpp (#9040)

Removes the filepath-related logic from readers, moving whole-file compression type inference up to `io/functions.cpp`. Also moves the lazy mmap datasource creation logic out of the csv/json readers and up to `io/functions.cpp`.

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - MithunR (https://github.com/mythrocks)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Marlene  (https://github.com/marlenezw)

URL: https://github.com/rapidsai/cudf/pull/9040
---
 cpp/include/cudf/io/csv.hpp            |  36 ++++-
 cpp/include/cudf/io/detail/avro.hpp    |  13 --
 cpp/include/cudf/io/detail/orc.hpp     |  13 --
 cpp/include/cudf/io/detail/parquet.hpp |  13 --
 cpp/include/cudf/io/json.hpp           |  34 ++++-
 cpp/src/io/avro/reader_impl.cu         |  10 --
 cpp/src/io/comp/io_uncomp.h            |   7 +-
 cpp/src/io/comp/uncomp.cpp             |  21 +--
 cpp/src/io/csv/reader_impl.cu          |  83 ++----------
 cpp/src/io/csv/reader_impl.hpp         |   4 -
 cpp/src/io/functions.cpp               | 174 ++++++++++++++++---------
 cpp/src/io/json/reader_impl.cu         |  89 +++----------
 cpp/src/io/json/reader_impl.hpp        |   7 +-
 cpp/src/io/orc/reader_impl.cu          |   9 --
 cpp/src/io/parquet/reader_impl.cu      |   9 --
 cpp/src/io/utilities/parsing_utils.cu  |  34 -----
 cpp/src/io/utilities/parsing_utils.cuh |  18 ---
 python/cudf/cudf/_lib/csv.pyx          |   2 +-
 18 files changed, 232 insertions(+), 344 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 455ffce7ed8..4545972e269 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -176,6 +176,40 @@ class csv_reader_options {
    */
   std::size_t get_byte_range_size() const { return _byte_range_size; }
 
+  /**
+   * @brief Returns number of bytes to read with padding, or 0 if the byte range size is 0.
+   */
+  std::size_t get_byte_range_size_with_padding() const
+  {
+    if (_byte_range_size == 0) {
+      return 0;
+    } else {
+      return _byte_range_size + get_byte_range_padding();
+    }
+  }
+
+  /**
+   * @brief Returns number of bytes to pad when reading.
+   */
+  std::size_t get_byte_range_padding() const
+  {
+    auto const num_names   = _names.size();
+    auto const num_dtypes  = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes);
+    auto const num_columns = std::max(num_dtypes, num_names);
+
+    auto const max_row_bytes = 16 * 1024;  // 16KB
+    auto const column_bytes  = 64;
+    auto const base_padding  = 1024;  // 1KB
+
+    if (num_columns == 0) {
+      // Use flat size if the number of columns is not known
+      return max_row_bytes;
+    }
+
+    // Expand the size based on the number of columns, if available
+    return base_padding + num_columns * column_bytes;
+  }
+
   /**
    * @brief Returns names of the columns.
    */
@@ -1163,7 +1197,7 @@ class csv_reader_options_builder {
  * @return The set of columns along with metadata.
  */
 table_with_metadata read_csv(
-  csv_reader_options const& options,
+  csv_reader_options options,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp
index 98483d1c03e..306c15dcb72 100644
--- a/cpp/include/cudf/io/detail/avro.hpp
+++ b/cpp/include/cudf/io/detail/avro.hpp
@@ -38,19 +38,6 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  avro_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index ab26c01db74..2174b688da2 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -47,19 +47,6 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  orc_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index d95af7a11da..14f27ef8eef 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -49,19 +49,6 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  parquet_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 31201e30ac6..5f34803f28e 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -136,6 +136,38 @@ class json_reader_options {
    */
   size_t get_byte_range_size() const { return _byte_range_size; }
 
+  /**
+   * @brief Returns number of bytes to read with padding, or 0 if the byte range size is 0.
+   */
+  size_t get_byte_range_size_with_padding() const
+  {
+    if (_byte_range_size == 0) {
+      return 0;
+    } else {
+      return _byte_range_size + get_byte_range_padding();
+    }
+  }
+
+  /**
+   * @brief Returns number of bytes to pad when reading.
+   */
+  size_t get_byte_range_padding() const
+  {
+    auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes);
+
+    auto const max_row_bytes = 16 * 1024;  // 16KB
+    auto const column_bytes  = 64;
+    auto const base_padding  = 1024;  // 1KB
+
+    if (num_columns == 0) {
+      // Use flat size if the number of columns is not known
+      return max_row_bytes;
+    }
+
+    // Expand the size based on the number of columns, if available
+    return base_padding + num_columns * column_bytes;
+  }
+
   /**
    * @brief Whether to read the file as a json object per line.
    */
@@ -328,7 +360,7 @@ class json_reader_options_builder {
  * @return The set of columns along with metadata.
  */
 table_with_metadata read_json(
-  json_reader_options const& options,
+  json_reader_options options,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index f6ffdd99d35..08ea96139a1 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -474,16 +474,6 @@ table_with_metadata reader::impl::read(avro_reader_options const& options,
   return {std::make_unique<table>(std::move(out_columns)), std::move(metadata_out)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               avro_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported.");
-  _impl = std::make_unique<impl>(datasource::create(filepaths[0]), options, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                avro_reader_options const& options,
diff --git a/cpp/src/io/comp/io_uncomp.h b/cpp/src/io/comp/io_uncomp.h
index 8daf73ecd0c..7b1feb84813 100644
--- a/cpp/src/io/comp/io_uncomp.h
+++ b/cpp/src/io/comp/io_uncomp.h
@@ -16,12 +16,13 @@
 
 #pragma once
 
+#include <cudf/io/types.hpp>
+#include <cudf/utilities/span.hpp>
+
 #include <memory>
 #include <string>
 #include <vector>
 
-#include <cudf/utilities/span.hpp>
-
 using cudf::host_span;
 
 namespace cudf {
@@ -42,7 +43,7 @@ enum {
 
 std::vector<char> io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type);
 
-std::vector<char> get_uncompressed_data(host_span<char const> data, std::string const& compression);
+std::vector<char> get_uncompressed_data(host_span<char const> data, compression_type compression);
 
 class HostDecompressor {
  public:
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 2cb99d897fe..e08cf1f8e1b 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -369,6 +369,7 @@ std::vector<char> io_uncompress_single_h2d(const void* src, size_t src_size, int
       // Unsupported format
       break;
   }
+
   CUDF_EXPECTS(comp_data != nullptr, "Unsupported compressed stream type");
   CUDF_EXPECTS(comp_len > 0, "Unsupported compressed stream type");
 
@@ -422,17 +423,17 @@ std::vector<char> io_uncompress_single_h2d(const void* src, size_t src_size, int
  * @return Vector containing the output uncompressed data
  */
 std::vector<char> get_uncompressed_data(host_span<char const> const data,
-                                        std::string const& compression)
+                                        compression_type compression)
 {
-  int comp_type = IO_UNCOMP_STREAM_TYPE_INFER;
-  if (compression == "gzip")
-    comp_type = IO_UNCOMP_STREAM_TYPE_GZIP;
-  else if (compression == "zip")
-    comp_type = IO_UNCOMP_STREAM_TYPE_ZIP;
-  else if (compression == "bz2")
-    comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2;
-  else if (compression == "xz")
-    comp_type = IO_UNCOMP_STREAM_TYPE_XZ;
+  auto const comp_type = [compression]() {
+    switch (compression) {
+      case compression_type::GZIP: return IO_UNCOMP_STREAM_TYPE_GZIP;
+      case compression_type::ZIP: return IO_UNCOMP_STREAM_TYPE_ZIP;
+      case compression_type::BZIP2: return IO_UNCOMP_STREAM_TYPE_BZIP2;
+      case compression_type::XZ: return IO_UNCOMP_STREAM_TYPE_XZ;
+      default: return IO_UNCOMP_STREAM_TYPE_INFER;
+    }
+  }();
 
   return io_uncompress_single_h2d(data.data(), data.size(), comp_type);
 }
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 7f85589a8aa..579a8a5549b 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -56,31 +56,6 @@ namespace csv {
 using namespace cudf::io::csv;
 using namespace cudf::io;
 
-/**
- * @brief Estimates the maximum expected length or a row, based on the number
- * of columns
- *
- * If the number of columns is not available, it will return a value large
- * enough for most use cases
- *
- * @param[in] num_columns Number of columns in the CSV file (optional)
- *
- * @return Estimated maximum size of a row, in bytes
- */
-constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept
-{
-  constexpr size_t max_row_bytes = 16 * 1024;  // 16KB
-  constexpr size_t column_bytes  = 64;
-  constexpr size_t base_padding  = 1024;  // 1KB
-  if (num_columns == 0) {
-    // Use flat size if the number of columns is not known
-    return max_row_bytes;
-  } else {
-    // Expand the size based on the number of columns, if available
-    return base_padding + num_columns * column_bytes;
-  }
-}
-
 /**
  * @brief Translates a dtype string and returns its dtype enumeration and any
  * extended dtype flags that are supported by cuIO. Often, this is a column
@@ -198,35 +173,22 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream)
 std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
 reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 {
-  auto range_offset  = opts_.get_byte_range_offset();
-  auto range_size    = opts_.get_byte_range_size();
-  auto skip_rows     = opts_.get_skiprows();
-  auto skip_end_rows = opts_.get_skipfooter();
-  auto num_rows      = opts_.get_nrows();
+  auto range_offset      = opts_.get_byte_range_offset();
+  auto range_size        = opts_.get_byte_range_size();
+  auto range_size_padded = opts_.get_byte_range_size_with_padding();
+  auto skip_rows         = opts_.get_skiprows();
+  auto skip_end_rows     = opts_.get_skipfooter();
+  auto num_rows          = opts_.get_nrows();
 
   if (range_offset > 0 || range_size > 0) {
-    CUDF_EXPECTS(compression_type_ == "none",
+    CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE,
                  "Reading compressed data using `byte range` is unsupported");
   }
-  size_t map_range_size = 0;
-  if (range_size != 0) {
-    auto num_given_dtypes =
-      std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes());
-    const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes);
-    map_range_size         = range_size + calculateMaxRowSize(num_columns);
-  }
-
-  // Support delayed opening of the file if using memory mapping datasource
-  // This allows only mapping of a subset of the file if using byte range
-  if (source_ == nullptr) {
-    assert(!filepath_.empty());
-    source_ = datasource::create(filepath_, range_offset, map_range_size);
-  }
 
   // Transfer source data to GPU
   if (!source_->is_empty()) {
-    auto data_size = (map_range_size != 0) ? map_range_size : source_->size();
-    auto buffer    = source_->host_read(range_offset, data_size);
+    auto const data_size = (range_size_padded != 0) ? range_size_padded : source_->size();
+    auto const buffer    = source_->host_read(range_offset, data_size);
 
     auto h_data = host_span<char const>(  //
       reinterpret_cast<const char*>(buffer->data()),
@@ -234,10 +196,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 
     std::vector<char> h_uncomp_data_owner;
 
-    if (compression_type_ != "none") {
-      h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_);
+    if (opts_.get_compression() != compression_type::NONE) {
+      h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression());
       h_data              = h_uncomp_data_owner;
     }
+
     // None of the parameters for row selection is used, we are parsing the entire file
     const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 &&
                                  skip_end_rows <= 0 && num_rows == -1;
@@ -845,35 +808,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
 }
 
 reader::impl::impl(std::unique_ptr<datasource> source,
-                   std::string filepath,
                    csv_reader_options const& options,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr)
-  : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options)
+  : mr_(mr), source_(std::move(source)), opts_(options)
 {
   num_actual_cols_ = opts_.get_names().size();
   num_active_cols_ = num_actual_cols_;
 
-  compression_type_ =
-    infer_compression_type(opts_.get_compression(),
-                           filepath,
-                           {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}});
-
   opts = make_parse_options(options, stream);
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               csv_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported.");
-  // Delay actual instantiation of data source until read to allow for
-  // partial memory mapping of file using byte ranges
-  _impl = std::make_unique<impl>(nullptr, filepaths[0], options, stream, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                csv_reader_options const& options,
@@ -881,7 +826,7 @@ reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported.");
-  _impl = std::make_unique<impl>(std::move(sources[0]), "", options, stream, mr);
+  _impl = std::make_unique<impl>(std::move(sources[0]), options, stream, mr);
 }
 
 // Destructor within this translation unit
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 4416457be16..de363a46ffe 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -72,13 +72,11 @@ class reader::impl {
    * @brief Constructor from a dataset source with reader options.
    *
    * @param source Dataset source
-   * @param filepath Filepath if reading dataset from a file
    * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
   explicit impl(std::unique_ptr<datasource> source,
-                std::string filepath,
                 csv_reader_options const& options,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);
@@ -213,8 +211,6 @@ class reader::impl {
  private:
   rmm::mr::device_memory_resource* mr_ = nullptr;
   std::unique_ptr<datasource> source_;
-  std::string filepath_;
-  std::string compression_type_;
   const csv_reader_options opts_;
 
   cudf::size_type num_records_ = 0;  // Number of rows with actual data
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index bf51012211c..438cb1762c6 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -106,76 +106,113 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
 }
 
 namespace {
-template <typename reader, typename reader_options>
-std::unique_ptr<reader> make_reader(source_info const& src_info,
-                                    reader_options const& options,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
-{
-  if (src_info.type == io_type::FILEPATH) {
-    return std::make_unique<reader>(src_info.filepaths, options, stream, mr);
-  }
 
-  std::vector<std::unique_ptr<datasource>> datasources;
-  if (src_info.type == io_type::HOST_BUFFER) {
-    datasources = cudf::io::datasource::create(src_info.buffers);
-  } else if (src_info.type == io_type::USER_IMPLEMENTED) {
-    datasources = cudf::io::datasource::create(src_info.user_sources);
-  } else {
-    CUDF_FAIL("Unsupported source type");
+std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
+                                                                    size_t range_offset = 0,
+                                                                    size_t range_size   = 0)
+{
+  switch (info.type) {
+    case io_type::FILEPATH: {
+      auto sources = std::vector<std::unique_ptr<cudf::io::datasource>>();
+      for (auto const& filepath : info.filepaths) {
+        sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size));
+      }
+      return sources;
+    }
+    case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers);
+    case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources);
+    default: CUDF_FAIL("Unsupported source type");
   }
-
-  return std::make_unique<reader>(std::move(datasources), options, stream, mr);
 }
 
-template <typename writer, typename... Ts>
-std::unique_ptr<writer> make_writer(sink_info const& sink, Ts&&... args)
+std::unique_ptr<data_sink> make_datasink(sink_info const& info)
 {
-  if (sink.type == io_type::FILEPATH) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(sink.filepath),
-                                    std::forward<Ts>(args)...);
-  }
-  if (sink.type == io_type::HOST_BUFFER) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(sink.buffer),
-                                    std::forward<Ts>(args)...);
-  }
-  if (sink.type == io_type::VOID) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(), std::forward<Ts>(args)...);
+  switch (info.type) {
+    case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath);
+    case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer);
+    case io_type::VOID: return cudf::io::data_sink::create();
+    case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink);
+    default: CUDF_FAIL("Unsupported sink type");
   }
-  if (sink.type == io_type::USER_IMPLEMENTED) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(sink.user_sink),
-                                    std::forward<Ts>(args)...);
-  }
-  CUDF_FAIL("Unsupported sink type");
 }
 
 }  // namespace
 
-table_with_metadata read_avro(avro_reader_options const& opts, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_avro(avro_reader_options const& options,
+                              rmm::mr::device_memory_resource* mr)
 {
   namespace avro = cudf::io::detail::avro;
 
   CUDF_FUNC_RANGE();
-  auto reader = make_reader<avro::reader>(opts.get_source(), opts, rmm::cuda_stream_default, mr);
-  return reader->read(opts);
+
+  auto datasources = make_datasources(options.get_source());
+  auto reader =
+    std::make_unique<avro::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
+
+  return reader->read(options);
+}
+
+// Resolves `AUTO` to a concrete type using the first file path's extension (gz/zip/bz2/xz).
+compression_type infer_compression_type(compression_type compression, source_info const& info)
+{
+  // An explicit user choice always wins; only `AUTO` triggers inference
+  if (compression != compression_type::AUTO) { return compression; }
+
+  // Only file paths carry an extension; guard against an empty path list before indexing
+  if (info.type != io_type::FILEPATH || info.filepaths.empty()) { return compression_type::NONE; }
+
+  auto const& filepath = info.filepaths[0];
+
+  // Attempt to infer from the file extension
+  const auto pos = filepath.find_last_of('.');
+  if (pos == std::string::npos) { return compression_type::NONE; }
+
+  // Lowercase via `unsigned char`: passing a negative `char` to `tolower` is undefined behavior
+  std::string ext;
+  std::transform(filepath.begin() + pos + 1, filepath.end(), std::back_inserter(ext),
+                 [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+  if (ext == "gz") { return compression_type::GZIP; }
+  if (ext == "zip") { return compression_type::ZIP; }
+  if (ext == "bz2") { return compression_type::BZIP2; }
+  if (ext == "xz") { return compression_type::XZ; }
+
+  // Unknown extension: treat the data as uncompressed
+  return compression_type::NONE;
+}
 
-table_with_metadata read_json(json_reader_options const& opts, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr)
 {
   namespace json = cudf::io::detail::json;
 
   CUDF_FUNC_RANGE();
-  auto reader = make_reader<json::reader>(opts.get_source(), opts, rmm::cuda_stream_default, mr);
-  return reader->read(opts);
+
+  options.set_compression(infer_compression_type(options.get_compression(), options.get_source()));
+
+  auto datasources = make_datasources(options.get_source(),
+                                      options.get_byte_range_offset(),
+                                      options.get_byte_range_size_with_padding());
+
+  auto reader =
+    std::make_unique<json::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
+
+  return reader->read(options);
 }
 
-table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr)
 {
   namespace csv = cudf::io::detail::csv;
 
   CUDF_FUNC_RANGE();
+
+  options.set_compression(infer_compression_type(options.get_compression(), options.get_source()));
+
+  auto datasources = make_datasources(options.get_source(),
+                                      options.get_byte_range_offset(),
+                                      options.get_byte_range_size_with_padding());
+
   auto reader =
-    make_reader<csv::reader>(options.get_source(), options, rmm::cuda_stream_default, mr);
+    std::make_unique<csv::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
 
   return reader->read();
 }
@@ -185,7 +222,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc
 {
   using namespace cudf::io::detail;
 
-  auto writer = make_writer<csv::writer>(options.get_sink(), options, rmm::cuda_stream_default, mr);
+  auto sink = make_datasink(options.get_sink());
+  auto writer =
+    std::make_unique<csv::writer>(std::move(sink), options, rmm::cuda_stream_default, mr);
 
   writer->write(options.get_table(), options.get_metadata());
 }
@@ -294,8 +333,10 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info)
 table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  auto reader =
-    make_reader<detail_orc::reader>(options.get_source(), options, rmm::cuda_stream_default, mr);
+
+  auto datasources = make_datasources(options.get_source());
+  auto reader      = std::make_unique<detail_orc::reader>(
+    std::move(datasources), options, rmm::cuda_stream_default, mr);
 
   return reader->read(options);
 }
@@ -305,11 +346,13 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_
  */
 void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr)
 {
+  namespace io_detail = cudf::io::detail;
+
   CUDF_FUNC_RANGE();
 
-  namespace io_detail = cudf::io::detail;
-  auto writer         = make_writer<detail_orc::writer>(
-    options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
+  auto sink   = make_datasink(options.get_sink());
+  auto writer = std::make_unique<detail_orc::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
 
   writer->write(options.get_table());
 }
@@ -317,12 +360,15 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc
 /**
  * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer
  */
-orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& op,
+orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options,
                                        rmm::mr::device_memory_resource* mr)
 {
   namespace io_detail = cudf::io::detail;
-  writer              = make_writer<detail_orc::writer>(
-    op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
+
+  auto sink = make_datasink(options.get_sink());
+
+  writer = std::make_unique<detail_orc::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
 }
 
 /**
@@ -354,8 +400,10 @@ table_with_metadata read_parquet(parquet_reader_options const& options,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  auto reader = make_reader<detail_parquet::reader>(
-    options.get_source(), options, rmm::cuda_stream_default, mr);
+
+  auto datasources = make_datasources(options.get_source());
+  auto reader      = std::make_unique<detail_parquet::reader>(
+    std::move(datasources), options, rmm::cuda_stream_default, mr);
 
   return reader->read(options);
 }
@@ -392,25 +440,31 @@ table_input_metadata::table_input_metadata(table_view const& table,
 std::unique_ptr<std::vector<uint8_t>> write_parquet(parquet_writer_options const& options,
                                                     rmm::mr::device_memory_resource* mr)
 {
-  CUDF_FUNC_RANGE();
   namespace io_detail = cudf::io::detail;
 
-  auto writer = make_writer<detail_parquet::writer>(
-    options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
+  CUDF_FUNC_RANGE();
+
+  auto sink   = make_datasink(options.get_sink());
+  auto writer = std::make_unique<detail_parquet::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
 
   writer->write(options.get_table());
+
   return writer->close(options.get_column_chunks_file_path());
 }
 
 /**
  * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer
  */
-parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op,
+parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options,
                                                rmm::mr::device_memory_resource* mr)
 {
   namespace io_detail = cudf::io::detail;
-  writer              = make_writer<detail_parquet::writer>(
-    op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
+
+  auto sink = make_datasink(options.get_sink());
+
+  writer = std::make_unique<detail_parquet::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
 }
 
 /**
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index f1080342312..bef97edc426 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -50,31 +50,6 @@ namespace detail {
 namespace json {
 using namespace cudf::io;
 
-namespace {
-/**
- * @brief Estimates the maximum expected length or a row, based on the number
- * of columns
- *
- * If the number of columns is not available, it will return a value large
- * enough for most use cases
- *
- * @param[in] num_columns Number of columns in the JSON file (optional)
- *
- * @return Estimated maximum size of a row, in bytes
- */
-constexpr size_t calculate_max_row_size(int num_columns = 0) noexcept
-{
-  constexpr size_t max_row_bytes = 16 * 1024;  // 16KB
-  constexpr size_t column_bytes  = 64;
-  constexpr size_t base_padding  = 1024;  // 1KB
-  return num_columns == 0
-           ? max_row_bytes  // Use flat size if the # of columns is not known
-           : base_padding +
-               num_columns * column_bytes;  // Expand size based on the # of columns, if available
-}
-
-}  // anonymous namespace
-
 /**
  * @brief Aggregate the table containing keys info by their hash values.
  *
@@ -231,25 +206,12 @@ std::pair<std::vector<std::string>, col_map_ptr_type> reader::impl::get_json_obj
  *
  * @param[in] range_offset Number of bytes offset from the start
  * @param[in] range_size Bytes to read; use `0` for all remaining data
+ * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data
  */
-void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
+void reader::impl::ingest_raw_input(size_t range_offset,
+                                    size_t range_size,
+                                    size_t range_size_padded)
 {
-  size_t map_range_size = 0;
-  if (range_size != 0) {
-    auto const dtype_option_size =
-      std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes());
-    map_range_size = range_size + calculate_max_row_size(dtype_option_size);
-  }
-
-  // Support delayed opening of the file if using memory mapping datasource
-  // This allows only mapping of a subset of the file if using byte range
-  if (sources_.empty()) {
-    assert(!filepaths_.empty());
-    for (const auto& path : filepaths_) {
-      sources_.emplace_back(datasource::create(path, range_offset, map_range_size));
-    }
-  }
-
   // Iterate through the user defined sources and read the contents into the local buffer
   CUDF_EXPECTS(!sources_.empty(), "No sources were defined");
   size_t total_source_size = 0;
@@ -262,14 +224,14 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
   size_t bytes_read = 0;
   for (const auto& source : sources_) {
     if (!source->is_empty()) {
-      auto data_size = (map_range_size != 0) ? map_range_size : source->size();
+      auto data_size = (range_size_padded != 0) ? range_size_padded : source->size();
       bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]);
     }
   }
 
   byte_range_offset_ = range_offset;
   byte_range_size_   = range_size;
-  load_whole_file_   = byte_range_offset_ == 0 && byte_range_size_ == 0;
+  load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0;
 }
 
 /**
@@ -280,11 +242,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
  */
 void reader::impl::decompress_input(rmm::cuda_stream_view stream)
 {
-  const auto compression_type =
-    infer_compression_type(options_.get_compression(),
-                           filepaths_.size() > 0 ? filepaths_[0] : "",
-                           {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}});
-  if (compression_type == "none") {
+  if (options_.get_compression() == compression_type::NONE) {
     // Do not use the owner vector here to avoid extra copy
     uncomp_data_ = reinterpret_cast<const char*>(buffer_.data());
     uncomp_size_ = buffer_.size();
@@ -293,12 +251,12 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream)
       host_span<char const>(                     //
         reinterpret_cast<const char*>(buffer_.data()),
         buffer_.size()),
-      compression_type);
+      options_.get_compression());
 
     uncomp_data_ = uncomp_data_owner_.data();
     uncomp_size_ = uncomp_data_owner_.size();
   }
-  if (load_whole_file_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream);
+  if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream);
 }
 
 rmm::device_uvector<uint64_t> reader::impl::find_record_starts(rmm::cuda_stream_view stream)
@@ -310,7 +268,7 @@ rmm::device_uvector<uint64_t> reader::impl::find_record_starts(rmm::cuda_stream_
   if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); }
   // If not starting at an offset, add an extra row to account for the first row in the file
   cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0);
-  if (load_whole_file_) {
+  if (load_whole_source_) {
     prefilter_count += count_all_from_set(data_, chars_to_count, stream);
   } else {
     prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream);
@@ -328,7 +286,7 @@ rmm::device_uvector<uint64_t> reader::impl::find_record_starts(rmm::cuda_stream_
   std::vector<char> chars_to_find{'\n'};
   if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); }
   // Passing offset = 1 to return positions AFTER the found character
-  if (load_whole_file_) {
+  if (load_whole_source_) {
     find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream);
   } else {
     find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream);
@@ -622,11 +580,10 @@ table_with_metadata reader::impl::convert_data_to_table(device_span<uint64_t con
 }
 
 reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                   std::vector<std::string> const& filepaths,
                    json_reader_options const& options,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr)
-  : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths)
+  : options_(options), mr_(mr), sources_(std::move(sources))
 {
   CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n");
 
@@ -649,10 +606,11 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
 table_with_metadata reader::impl::read(json_reader_options const& options,
                                        rmm::cuda_stream_view stream)
 {
-  auto range_offset = options.get_byte_range_offset();
-  auto range_size   = options.get_byte_range_size();
+  auto range_offset      = options.get_byte_range_offset();
+  auto range_size        = options.get_byte_range_size();
+  auto range_size_padded = options.get_byte_range_size_with_padding();
 
-  ingest_raw_input(range_offset, range_size);
+  ingest_raw_input(range_offset, range_size, range_size_padded);
   CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n");
 
   decompress_input(stream);
@@ -674,26 +632,13 @@ table_with_metadata reader::impl::read(json_reader_options const& options,
   return convert_data_to_table(rec_starts, stream);
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               json_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  // Delay actual instantiation of data source until read to allow for
-  // partial memory mapping of file using byte ranges
-  std::vector<std::unique_ptr<datasource>> src = {};  // Empty datasources
-  _impl = std::make_unique<impl>(std::move(src), filepaths, options, stream, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                json_reader_options const& options,
                rmm::cuda_stream_view stream,
                rmm::mr::device_memory_resource* mr)
 {
-  std::vector<std::string> file_paths = {};  // Empty filepaths
-  _impl = std::make_unique<impl>(std::move(sources), file_paths, options, stream, mr);
+  _impl = std::make_unique<impl>(std::move(sources), options, stream, mr);
 }
 
 // Destructor within this translation unit
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index bbda7e9ba74..4d14edf360a 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -57,7 +57,6 @@ class reader::impl {
   rmm::mr::device_memory_resource* mr_ = nullptr;
 
   std::vector<std::unique_ptr<datasource>> sources_;
-  std::vector<std::string> filepaths_;
   std::vector<uint8_t> buffer_;
 
   const char* uncomp_data_ = nullptr;
@@ -69,7 +68,7 @@ class reader::impl {
 
   size_t byte_range_offset_ = 0;
   size_t byte_range_size_   = 0;
-  bool load_whole_file_     = true;
+  bool load_whole_source_   = true;
 
   table_metadata metadata_;
   std::vector<data_type> dtypes_;
@@ -110,8 +109,9 @@ class reader::impl {
    *
    * @param[in] range_offset Number of bytes offset from the start
    * @param[in] range_size Bytes to read; use `0` for all remaining data
+   * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data
    */
-  void ingest_raw_input(size_t range_offset, size_t range_size);
+  void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded);
 
   /**
    * @brief Extract the JSON objects keys from the input file with object rows.
@@ -184,7 +184,6 @@ class reader::impl {
    * @brief Constructor from a dataset source with reader options.
    */
   explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                std::vector<std::string> const& filepaths,
                 json_reader_options const& options,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index f7bd5ae86b8..33d19aeeabf 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1438,15 +1438,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               orc_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  _impl = std::make_unique<impl>(datasource::create(filepaths), options, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                orc_reader_options const& options,
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index caf11b66206..749ee38e816 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1690,15 +1690,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               parquet_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-  : _impl(std::make_unique<impl>(datasource::create(filepaths), options, mr))
-{
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                parquet_reader_options const& options,
diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu
index 6c8f01111e5..ba62238c5d3 100644
--- a/cpp/src/io/utilities/parsing_utils.cu
+++ b/cpp/src/io/utilities/parsing_utils.cu
@@ -209,39 +209,5 @@ cudf::size_type count_all_from_set(const char* h_data,
   return find_all_from_set<void>(h_data, h_size, keys, 0, nullptr, stream);
 }
 
-std::string infer_compression_type(
-  const compression_type& compression_arg,
-  const std::string& filename,
-  const std::vector<std::pair<std::string, std::string>>& ext_to_comp_map)
-{
-  auto str_tolower = [](const auto& begin, const auto& end) {
-    std::string out;
-    std::transform(begin, end, std::back_inserter(out), ::tolower);
-    return out;
-  };
-
-  // Attempt to infer from user-supplied argument
-  if (compression_arg != compression_type::AUTO) {
-    switch (compression_arg) {
-      case compression_type::GZIP: return "gzip";
-      case compression_type::BZIP2: return "bz2";
-      case compression_type::ZIP: return "zip";
-      case compression_type::XZ: return "xz";
-      default: break;
-    }
-  }
-
-  // Attempt to infer from the file extension
-  const auto pos = filename.find_last_of('.');
-  if (pos != std::string::npos) {
-    const auto ext = str_tolower(filename.begin() + pos + 1, filename.end());
-    for (const auto& mapping : ext_to_comp_map) {
-      if (mapping.first == ext) { return mapping.second; }
-    }
-  }
-
-  return "none";
-}
-
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 88297423b9b..daf23de7eb2 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -454,24 +454,6 @@ cudf::size_type count_all_from_set(const char* h_data,
                                    const std::vector<char>& keys,
                                    rmm::cuda_stream_view stream);
 
-/**
- * @brief Infer file compression type based on user supplied arguments.
- *
- * If the user specifies a valid compression_type for compression arg,
- * compression type will be computed based on that.  Otherwise the filename
- * and ext_to_comp_map will be used.
- *
- * @param[in] compression_arg User specified compression type (if any)
- * @param[in] filename Filename to base compression type (by extension) on
- * @param[in] ext_to_comp_map User supplied mapping of file extension to compression type
- *
- * @return string representing compression type ("gzip, "bz2", etc)
- */
-std::string infer_compression_type(
-  const compression_type& compression_arg,
-  const std::string& filename,
-  const std::vector<std::pair<std::string, std::string>>& ext_to_comp_map);
-
 /**
  * @brief Checks whether the given character is a whitespace character.
  *
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 812d614e6d3..9912a7801a4 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -112,7 +112,7 @@ cdef csv_reader_options make_csv_reader_options(
     bool na_filter,
     object prefix,
     object index_col,
-) except +:
+) except *:
     cdef source_info c_source_info = make_source_info([datasource])
     cdef compression_type c_compression
     cdef size_type c_header

From abba33f3364c7f240e6c8047069b03f4ea591024 Mon Sep 17 00:00:00 2001
From: NV-jpt <86264103+NV-jpt@users.noreply.github.com>
Date: Tue, 24 Aug 2021 12:59:14 -0400
Subject: [PATCH 31/46] Add struct accessor to dask-cudf (#8874)

This PR implements the requested 'Struct Accessor' feature in dask-cudf (Issue [#8658](https://github.com/rapidsai/cudf/issues/8658)).

The StructMethods class is implemented to expose the 'field(key)' method in dask-cudf.

        Examples
        --------
        >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
        >>> ds = dask_cudf.from_cudf(s, 2)
        >>> ds.struct.field(0).compute()
        0    1
        1    3
        dtype: int64
        >>> ds.struct.field('a').compute()
        0    1
        1    3
        dtype: int64

Authors:
  - https://github.com/NV-jpt
  - https://github.com/shaneding

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/8874
---
 python/dask_cudf/dask_cudf/accessors.py       | 37 +++++++++++
 python/dask_cudf/dask_cudf/core.py            |  6 +-
 .../dask_cudf/tests/test_accessor.py          | 62 +++++++++++++++++++
 3 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py
index 04d3e20b844..77973ee34ff 100644
--- a/python/dask_cudf/dask_cudf/accessors.py
+++ b/python/dask_cudf/dask_cudf/accessors.py
@@ -1,6 +1,43 @@
 # Copyright (c) 2021, NVIDIA CORPORATION.
 
 
+class StructMethods:
+    def __init__(self, d_series):
+        self.d_series = d_series
+
+    def field(self, key):
+        """
+        Extract children of the specified struct column in the Series
+
+        Parameters
+        ----------
+        key: int or str
+            index/position or field name of the respective struct column
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
+        >>> ds = dask_cudf.from_cudf(s, 2)
+        >>> ds.struct.field(0).compute()
+        0    1
+        1    3
+        dtype: int64
+        >>> ds.struct.field('a').compute()
+        0    1
+        1    3
+        dtype: int64
+        """
+        typ = self.d_series._meta.struct.field(key).dtype
+        return self.d_series.map_partitions(
+            lambda s: s.struct.field(key),
+            meta=self.d_series._meta._constructor([], dtype=typ),
+        )
+
+
 class ListMethods:
     def __init__(self, d_series):
         self.d_series = d_series
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 1a632907047..f1fb408b0d1 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -27,7 +27,7 @@
 from cudf import _lib as libcudf
 
 from dask_cudf import sorting
-from dask_cudf.accessors import ListMethods
+from dask_cudf.accessors import ListMethods, StructMethods
 
 DASK_VERSION = LooseVersion(dask.__version__)
 
@@ -414,6 +414,10 @@ def groupby(self, *args, **kwargs):
     def list(self):
         return ListMethods(self)
 
+    @property
+    def struct(self):
+        return StructMethods(self)
+
 
 class Index(Series, dd.core.Index):
     _partition_type = cudf.Index
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 342f2b60180..8227023aa51 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -438,3 +438,65 @@ def test_sorting(data, ascending, na_position, ignore_index):
         .reset_index(drop=True)
     )
     assert_eq(expect, got)
+
+
+#############################################################################
+#                            Struct Accessor                                #
+#############################################################################
+struct_accessor_data_params = [
+    [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}],
+    [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}],
+    [{"a": 1, "b": 2}],
+    [{"a": 1, "b": 3, "c": 4}],
+]
+
+
+@pytest.mark.parametrize(
+    "data", struct_accessor_data_params,
+)
+def test_create_struct_series(data):
+    expect = pd.Series(data)
+    ds_got = dgd.from_cudf(Series(data), 2)
+    assert_eq(expect, ds_got.compute())
+
+
+@pytest.mark.parametrize(
+    "data", struct_accessor_data_params,
+)
+def test_struct_field_str(data):
+    for test_key in ["a", "b"]:
+        expect = Series(data).struct.field(test_key)
+        ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key)
+        assert_eq(expect, ds_got.compute())
+
+
+@pytest.mark.parametrize(
+    "data", struct_accessor_data_params,
+)
+def test_struct_field_integer(data):
+    for test_key in [0, 1]:
+        expect = Series(data).struct.field(test_key)
+        ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key)
+        assert_eq(expect, ds_got.compute())
+
+
+@pytest.mark.parametrize(
+    "data", struct_accessor_data_params,
+)
+def test_dask_struct_field_Key_Error(data):
+    got = dgd.from_cudf(Series(data), 2)
+
+    # A field name absent from the struct must raise KeyError.
+    with pytest.raises(KeyError):
+        got.struct.field("notakey").compute()
+
+
+@pytest.mark.parametrize(
+    "data", struct_accessor_data_params,
+)
+def test_dask_struct_field_Int_Error(data):
+    # A field position beyond the struct's children must raise IndexError.
+    got = dgd.from_cudf(Series(data), 2)
+
+    with pytest.raises(IndexError):
+        got.struct.field(1000).compute()

From a15349358ae2d46cd7f93751452607de4aa09f8b Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)"
 <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Tue, 24 Aug 2021 23:37:05 +0530
Subject: [PATCH 32/46] Add support for reading ORC file with no row group
 index (#9060)

The ORC reader in cuIO was designed on the assumption that the row group index is always available, so reading files without one failed.
The reader now handles ORC files whose row group index stream is not present.

closes #8878

Authors:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Devavret Makkar (https://github.com/devavret)
  - Vukasin Milovanovic (https://github.com/vuule)
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/9060
---
 cpp/src/io/orc/reader_impl.cu                 |  51 ++++++++++++------
 .../TestOrcFile.NoIndStrm.IntWithNulls.orc    | Bin 0 -> 101 bytes
 ...dStrm.StructAndIntWithNulls.TwoStripes.orc | Bin 0 -> 232 bytes
 ...rcFile.NoIndStrm.StructAndIntWithNulls.orc | Bin 0 -> 193 bytes
 ...estOrcFile.NoIndStrm.StructWithNoNulls.orc | Bin 0 -> 167 bytes
 python/cudf/cudf/tests/test_orc.py            |  18 +++++++
 6 files changed, 54 insertions(+), 15 deletions(-)
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 33d19aeeabf..1b78d8b8585 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -269,6 +269,7 @@ class aggregate_orc_metadata {
   size_type const num_rows;
   size_type const num_columns;
   size_type const num_stripes;
+  bool row_grp_idx_present = true;
 
   /**
    * @brief Create a metadata object from each element in the source vector
@@ -368,6 +369,8 @@ class aggregate_orc_metadata {
     return per_file_metadata[source_idx].get_column_name(column_idx);
   }
 
+  auto is_row_grp_idx_present() const { return row_grp_idx_present; }
+
   std::vector<cudf::io::orc::metadata::stripe_source_mapping> select_stripes(
     std::vector<std::vector<size_type>> const& user_specified_stripes,
     size_type& row_start,
@@ -457,6 +460,7 @@ class aggregate_orc_metadata {
           ProtobufReader(sf_data, sf_length)
             .read(per_file_metadata[mapping.source_idx].stripefooters[i]);
           mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i];
+          if (stripe->indexLength == 0) { row_grp_idx_present = false; }
         }
       }
     }
@@ -1101,6 +1105,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
     // Association between each ORC column and its cudf::column
     _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1);
     std::vector<orc_column_meta> nested_col;
+    bool is_data_empty = false;
 
     // Get a list of column data types
     std::vector<data_type> column_types;
@@ -1157,6 +1162,8 @@ table_with_metadata reader::impl::read(size_type skip_rows,
 
       const bool use_index =
         (_use_index == true) &&
+        // Do stripes have row group index
+        _metadata->is_row_grp_idx_present() &&
         // Only use if we don't have much work with complete columns & stripes
         // TODO: Consider nrows, gpu, and tune the threshold
         (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) &&
@@ -1204,13 +1211,21 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                                           stream_info,
                                                           level == 0);
 
-          CUDF_EXPECTS(total_data_size > 0, "Expected streams data within stripe");
+          if (total_data_size == 0) {
+            CUDF_EXPECTS(stripe_info->indexLength == 0, "Invalid index rowgroup stream data");
+            // In case ROW GROUP INDEX is not present and all columns are structs with no null
+            // stream, there is nothing to read at this level.
+            auto fn_check_dtype = [](auto dtype) { return dtype.id() == type_id::STRUCT; };
+            CUDF_EXPECTS(std::all_of(column_types.begin(), column_types.end(), fn_check_dtype),
+                         "Expected streams data within stripe");
+            is_data_empty = true;
+          }
 
           stripe_data.emplace_back(total_data_size, stream);
           auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());
 
           // Coalesce consecutive streams into one read
-          while (stream_count < stream_info.size()) {
+          while (not is_data_empty and stream_count < stream_info.size()) {
             const auto d_dst  = dst_base + stream_info[stream_count].dst_pos;
             const auto offset = stream_info[stream_count].offset;
             auto len          = stream_info[stream_count].length;
@@ -1292,8 +1307,10 @@ table_with_metadata reader::impl::read(size_type skip_rows,
             if (chunk.type_kind == orc::TIMESTAMP) {
               chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
             }
-            for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-              chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
+            if (not is_data_empty) {
+              for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
+                chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
+              }
             }
           }
           stripe_start_row += num_rows_per_stripe;
@@ -1327,7 +1344,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                          });
         }
         // Setup row group descriptors if using indexes
-        if (_metadata->per_file_metadata[0].ps.compression != orc::NONE) {
+        if (_metadata->per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) {
           auto decomp_data =
             decompress_stripe_data(chunks,
                                    stripe_data,
@@ -1378,19 +1395,23 @@ table_with_metadata reader::impl::read(size_type skip_rows,
           out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr);
         }
 
-        decode_stream_data(chunks,
-                           num_dict_entries,
-                           skip_rows,
-                           tz_table.view(),
-                           row_groups,
-                           _metadata->get_row_index_stride(),
-                           out_buffers[level],
-                           level,
-                           stream);
+        if (not is_data_empty) {
+          decode_stream_data(chunks,
+                             num_dict_entries,
+                             skip_rows,
+                             tz_table.view(),
+                             row_groups,
+                             _metadata->get_row_index_stride(),
+                             out_buffers[level],
+                             level,
+                             stream);
+        }
 
         // Extract information to process nested child columns
         if (nested_col.size()) {
-          scan_null_counts(chunks, null_count_prefix_sums[level], stream);
+          if (not is_data_empty) {
+            scan_null_counts(chunks, null_count_prefix_sums[level], stream);
+          }
           row_groups.device_to_host(stream, true);
           aggregate_child_meta(chunks, row_groups, out_buffers[level], nested_col, level);
         }
diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc
new file mode 100644
index 0000000000000000000000000000000000000000..2103e0212fcdcc9a110e0dbe550d2fcb94bb640d
GIT binary patch
literal 101
zcmeYda%N><_<x_5f#DFF2GdjrJqCuyNski*5~d_P1=1{+<~aGt@Hm`dVQ^$%;9wTe
zmf`}E3=(__QW_je92`7CjEqu@iAqcy%my4B$^wiM4GKz33_KbP2AoY_n3?^9oW%eF
CbQJyo

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc
new file mode 100644
index 0000000000000000000000000000000000000000..e57da851820d4b76e6eaa581c3215ce8e397944e
GIT binary patch
literal 232
zcmeYda^_@UIKZyK#LB?%|2{7R!yz^srl}6xAi*E}OkNBOkCPrJh$Ku(cn+im5~d|Q
zW%?MYVsX%9HA}mIFgvq|vEoG^2CyLvNQQ{O47otOAx%Kb62xAlPI>ewK`ddygwP8Y
z9x!aOO~_G|<C9ZVa|_cv=AwBlgrmPRvdhr3%P_LDvpKSpS>w<NrK2;Lo-uB0WMJTM
a5nz;PP*7rG;L%_(;B5NB%<LcJECv7@fJzns

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc
new file mode 100644
index 0000000000000000000000000000000000000000..32d0c85dd25b7cbe4c9e3c434b438f57cab52bc2
GIT binary patch
literal 193
zcmeYda%N><_<x_5f#DFF2Gdjr1kVP@;|B?UV1$W-xj*=U+&Bh?$4QS91QI4CJO$D$
zm*&JM*cneb&-BcgYlBxo<I$T79_0wdIQhu%1e|GcX*Oi20_sQ*J2C0eqXe;p2?n7C
zmmUN>diX#vDJf-glFyVRpXLuUW-OJMyhLL1)EO2t6qR(=1odbJ^{h+~)G&L%z`)@k
Zz$nq6pv1($qrqUn+4O~(*+0lx3;@GaM5X`$

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc
new file mode 100644
index 0000000000000000000000000000000000000000..1c6e53a0b92f68b996b4a249b7050eae715331e9
GIT binary patch
literal 167
zcmeYda%N{>U}a<gkqqn>3=E$=&YnDZ-Sh)PaGSs>4kt-x4PlN{u7zh5rdTp?f|Q<M
zbYWn4ob)(BBw<<th;F)c#vy7Rx5_z{gL+4f9Ar4I<6y|gkOowgAl5PI(W3;hgb4;g
z0aqUcJbLgzFezn9l23>b!=@QCW-OgC`Ejpf%R(Vxc42|d(gh3*9A*NH5)BGUObpx_
P3<jJ{UznNwgPg?x?Q1ut

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 33ce8427a71..0f769d17015 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1171,3 +1171,21 @@ def test_writer_timestamp_stream_size(datadir, tmpdir):
     got = pa.orc.ORCFile(gdf_fname).read().to_pandas()
 
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "fname",
+    [
+        "TestOrcFile.NoIndStrm.StructWithNoNulls.orc",
+        "TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc",
+        "TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc",
+        "TestOrcFile.NoIndStrm.IntWithNulls.orc",
+    ],
+)
+def test_no_row_group_index_orc_read(datadir, fname):
+    # pyarrow's ORC reader supplies the reference table; cudf is under test.
+    orc_path = datadir / fname
+    expect = pa.orc.ORCFile(orc_path).read()
+    got = cudf.read_orc(orc_path)
+
+    assert expect.equals(got.to_arrow())

From 807519983dc57b103a2b8e081e47fac918813615 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Tue, 24 Aug 2021 16:21:59 -0500
Subject: [PATCH 33/46] multibyte_split (#8702)

Adds the `multibyte_split` API, part of #8557. Takes one large text input and splits it into a single strings column.

- Features:
  - [x] split on multi-byte delimiters
  - [x] split on multiple delimiters simultaneously
  - [ ] erase delimiters from output (will implement later)
  - [ ] replace delimiters with alternate text (will implement later)
- Supported input types
  - [x] `cudf::io::text::data_chunk_source`
    - [x] `cudf::string_scalar` via `cudf::device_span`
    - [x] `std::string` via `std::istream`
    - [x] files via `std::istream`
- Supported delimiter type
  - [x] `std::string`
- Performance Goals
  - [x] ~2G/s from file, ~4G/s on-device. There is room for improvement, but perf is good enough for now.
- Additional goals:
  - [x] add reusable block-level pattern-matching utility.
  - [ ] add reusable block-level utility to "peek" at "future" scan states (will implement with delimiter erasure).

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Elias Stehle (https://github.com/elstehle)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Devavret Makkar (https://github.com/devavret)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/8702
---
 conda/recipes/libcudf/meta.yaml               |  10 +-
 cpp/CMakeLists.txt                            |   1 +
 cpp/benchmarks/CMakeLists.txt                 |   5 +
 cpp/benchmarks/io/cuio_benchmark_common.hpp   |   2 +
 .../io/text/multibyte_split_benchmark.cpp     | 164 ++++++++
 cpp/include/cudf/column/column_factories.hpp  |  20 +
 .../cudf/io/text/data_chunk_source.hpp        |  70 ++++
 .../io/text/data_chunk_source_factories.hpp   | 231 ++++++++++
 .../cudf/io/text/detail/multistate.hpp        | 155 +++++++
 .../cudf/io/text/detail/tile_state.hpp        | 134 ++++++
 cpp/include/cudf/io/text/detail/trie.hpp      | 264 ++++++++++++
 cpp/include/cudf/io/text/multibyte_split.hpp  |  37 ++
 cpp/src/io/text/multibyte_split.cu            | 396 ++++++++++++++++++
 cpp/src/strings/strings_column_factories.cu   |  42 ++
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/io/text/multibyte_split_test.cpp    | 143 +++++++
 16 files changed, 1673 insertions(+), 2 deletions(-)
 create mode 100644 cpp/benchmarks/io/text/multibyte_split_benchmark.cpp
 create mode 100644 cpp/include/cudf/io/text/data_chunk_source.hpp
 create mode 100644 cpp/include/cudf/io/text/data_chunk_source_factories.hpp
 create mode 100644 cpp/include/cudf/io/text/detail/multistate.hpp
 create mode 100644 cpp/include/cudf/io/text/detail/tile_state.hpp
 create mode 100644 cpp/include/cudf/io/text/detail/trie.hpp
 create mode 100644 cpp/include/cudf/io/text/multibyte_split.hpp
 create mode 100644 cpp/src/io/text/multibyte_split.cu
 create mode 100644 cpp/tests/io/text/multibyte_split_test.cpp

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 208c21c2dc0..0f05dcb4bb3 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -118,10 +118,9 @@ test:
     - test -f $PREFIX/include/cudf/hashing.hpp
     - test -f $PREFIX/include/cudf/interop.hpp
     - test -f $PREFIX/include/cudf/io/avro.hpp
+    - test -f $PREFIX/include/cudf/io/csv.hpp
     - test -f $PREFIX/include/cudf/io/data_sink.hpp
     - test -f $PREFIX/include/cudf/io/datasource.hpp
-    - test -f $PREFIX/include/cudf/io/orc_metadata.hpp
-    - test -f $PREFIX/include/cudf/io/csv.hpp
     - test -f $PREFIX/include/cudf/io/detail/avro.hpp
     - test -f $PREFIX/include/cudf/io/detail/csv.hpp
     - test -f $PREFIX/include/cudf/io/detail/json.hpp
@@ -129,8 +128,15 @@ test:
     - test -f $PREFIX/include/cudf/io/detail/parquet.hpp
     - test -f $PREFIX/include/cudf/io/detail/utils.hpp
     - test -f $PREFIX/include/cudf/io/json.hpp
+    - test -f $PREFIX/include/cudf/io/orc_metadata.hpp
     - test -f $PREFIX/include/cudf/io/orc.hpp
     - test -f $PREFIX/include/cudf/io/parquet.hpp
+    - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
+    - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
+    - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp
+    - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp
+    - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp
+    - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp
     - test -f $PREFIX/include/cudf/io/types.hpp
     - test -f $PREFIX/include/cudf/ipc.hpp
     - test -f $PREFIX/include/cudf/join.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d6b457a94d4..d9a493f57a0 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -307,6 +307,7 @@ add_library(cudf
     src/io/parquet/writer_impl.cu
     src/io/statistics/orc_column_statistics.cu
     src/io/statistics/parquet_column_statistics.cu
+    src/io/text/multibyte_split.cu
     src/io/utilities/column_buffer.cpp
     src/io/utilities/data_sink.cpp
     src/io/utilities/datasource.cpp
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 56f17dc7090..b3b92003573 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -245,3 +245,8 @@ ConfigureBench(STRINGS_BENCH
 # - json benchmark -------------------------------------------------------------------
 ConfigureBench(JSON_BENCH
   string/json_benchmark.cpp)
+
+###################################################################################################
+# - io benchmark ---------------------------------------------------------------------
+ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK
+  io/text/multibyte_split_benchmark.cpp)
diff --git a/cpp/benchmarks/io/cuio_benchmark_common.hpp b/cpp/benchmarks/io/cuio_benchmark_common.hpp
index 2c49386a901..7107585dbcc 100644
--- a/cpp/benchmarks/io/cuio_benchmark_common.hpp
+++ b/cpp/benchmarks/io/cuio_benchmark_common.hpp
@@ -33,6 +33,8 @@ using cudf::io::io_type;
   benchmark(name##_buffer_output, type_or_group, static_cast<uint32_t>(io_type::HOST_BUFFER)); \
   benchmark(name##_void_output, type_or_group, static_cast<uint32_t>(io_type::VOID));
 
+std::string random_file_in_dir(std::string const& dir_path);
+
 /**
  * @brief Class to create a coupled `source_info` and `sink_info` of given type.
  */
diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp
new file mode 100644
index 00000000000..cb8a61caa57
--- /dev/null
+++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_benchmark_common.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf_test/file_utilities.hpp>
+
+#include <cudf/io/text/data_chunk_source_factories.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/transform.h>
+
+#include <cstdio>
+#include <fstream>
+#include <memory>
+
+using cudf::test::fixed_width_column_wrapper;
+
+temp_directory const temp_dir("cudf_gbench");
+
+enum data_chunk_source_type {
+  device,
+  file,
+  host,
+};
+
+static cudf::string_scalar create_random_input(int32_t num_chars,
+                                               double delim_factor,
+                                               double deviation,
+                                               std::string delim)
+{
+  auto const num_delims      = static_cast<int32_t>((num_chars * delim_factor) / delim.size());
+  auto const num_delim_chars = num_delims * delim.size();
+  auto const num_value_chars = num_chars - num_delim_chars;
+  auto const num_rows        = num_delims;
+  auto const value_size_avg  = static_cast<int32_t>(num_value_chars / num_rows);
+  auto const value_size_min  = static_cast<int32_t>(value_size_avg * (1 - deviation));
+  auto const value_size_max  = static_cast<int32_t>(value_size_avg * (1 + deviation));
+  // NOTE(review): num_rows == 0 (tiny num_chars or delim_factor) would divide by zero above.
+  data_profile table_profile;
+
+  table_profile.set_distribution_params(  //
+    cudf::type_id::STRING,
+    distribution_id::NORMAL,
+    value_size_min,
+    value_size_max);
+
+  auto const values_table = create_random_table(  //
+    {cudf::type_id::STRING},
+    1,
+    row_count{num_rows},
+    table_profile);
+  // Pair every random value with one delimiter; concatenate presumably joins them row-wise.
+  auto delim_scalar  = cudf::make_string_scalar(delim);
+  auto delims_column = cudf::make_column_from_scalar(*delim_scalar, num_rows);
+  auto input_table  = cudf::table_view({values_table->get_column(0).view(), delims_column->view()});
+  auto input_column = cudf::strings::concatenate(input_table);
+
+  // Extract the chars child from the returned strings column.
+  auto input_column_contents = input_column->release();
+  auto chars_column_contents = input_column_contents.children[1]->release();
+  auto chars_buffer          = chars_column_contents.data.release();
+
+  // Turn the chars buffer into a string scalar.
+  return cudf::string_scalar(std::move(*chars_buffer));
+}
+
+// Benchmarks multibyte_split over a device, file, or host source built from one random input.
+static void BM_multibyte_split(benchmark::State& state)
+{
+  auto source_type      = static_cast<data_chunk_source_type>(state.range(0));
+  auto delim_size       = state.range(1);
+  auto delim_percent    = state.range(2);
+  auto file_size_approx = state.range(3);
+
+  CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1");
+  CUDF_EXPECTS(delim_percent <= 50, "delimiter percent must be at most 50");
+  // `delim_size` copies of ':'. The (pointer, count) constructor would read past the ":" literal.
+  auto delim = std::string(static_cast<std::size_t>(delim_size), ':');
+  auto delim_factor = static_cast<double>(delim_percent) / 100;
+  auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim);
+  auto host_input   = thrust::host_vector<char>(device_input.size());
+
+  cudaMemcpyAsync(host_input.data(),
+                  device_input.data(),
+                  device_input.size() * sizeof(char),
+                  cudaMemcpyDeviceToHost,
+                  rmm::cuda_stream_default);
+  // The copy must have completed before the host buffer is read for the string and the file.
+  cudaDeviceSynchronize();
+
+  auto host_string    = std::string(host_input.data(), host_input.size());
+  auto temp_file_name = random_file_in_dir(temp_dir.path());
+
+  {
+    auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out);
+    temp_fostream.write(host_input.data(), host_input.size());
+  }
+
+  auto source = std::unique_ptr<cudf::io::text::data_chunk_source>(nullptr);
+
+  switch (source_type) {
+    case data_chunk_source_type::file:  //
+      source = cudf::io::text::make_source_from_file(temp_file_name);
+      break;
+    case data_chunk_source_type::host:  //
+      source = cudf::io::text::make_source(host_string);
+      break;
+    case data_chunk_source_type::device:  //
+      source = cudf::io::text::make_source(device_input);
+      break;
+    default: CUDF_FAIL();
+  }
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true);
+    auto output = cudf::io::text::multibyte_split(*source, delim);
+  }
+
+  state.SetBytesProcessed(state.iterations() * device_input.size());
+}
+
+class MultibyteSplitBenchmark : public cudf::benchmark {
+};
+
+#define TRANSPOSE_BM_BENCHMARK_DEFINE(name)                                     \
+  BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \
+  {                                                                             \
+    BM_multibyte_split(state);                                                  \
+  }                                                                             \
+  BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name)                           \
+    ->ArgsProduct({{data_chunk_source_type::device,                             \
+                    data_chunk_source_type::file,                               \
+                    data_chunk_source_type::host},                              \
+                   {1, 4, 7},                                                   \
+                   {1, 25},                                                     \
+                   {1 << 15, 1 << 30}})                                         \
+    ->UseManualTime()                                                           \
+    ->Unit(::benchmark::kMillisecond);
+
+TRANSPOSE_BM_BENCHMARK_DEFINE(multibyte_split_simple);
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index bdb7fd48e60..ebd7f5bbef0 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -442,6 +442,26 @@ std::unique_ptr<column> make_strings_column(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Construct a STRING type column given offsets, chars, and optional null count and null
+ * mask.
+ *
+ * @param[in] num_strings The number of strings the column represents.
+ * @param[in] offsets The offset values for this column. The number of elements is one more than the
+ * total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the
+ * strings vector.
+ * @param[in] chars The char bytes for all the strings for this column. Individual strings are
+ * identified by the offsets and the nullmask.
+ * @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for
+ *  nulls is used for interpreting this bitmask.
+ * @param[in] null_count The number of null string entries.
+ */
+std::unique_ptr<column> make_strings_column(size_type num_strings,
+                                            rmm::device_uvector<size_type>&& offsets,
+                                            rmm::device_uvector<char>&& chars,
+                                            rmm::device_buffer&& null_mask = {},
+                                            size_type null_count = cudf::UNKNOWN_NULL_COUNT);
+
 /**
  * @brief Construct a LIST type column given offsets column, child column, null mask and null
  * count.
diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp
new file mode 100644
index 00000000000..6ee1fa033d0
--- /dev/null
+++ b/cpp/include/cudf/io/text/data_chunk_source.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/device_buffer.hpp>
+
+namespace cudf {
+namespace io {
+namespace text {
+
+/**
+ * @brief a reader capable of producing views over device memory.
+ *
+ * The data chunk reader API encapsulates the idea of statefully traversing and loading a data
+ * source. A data source may be a file, a region of device memory, or a region of host memory.
+ * Reading data from these data sources efficiently requires different strategies depending on the
+ * type of data source, type of compression, capabilities of the host and device, the data's
+ * destination. Whole-file decompression should be hidden behind this interface.
+ */
+class data_chunk_reader {
+ public:
+  // Readers are owned and destroyed through base pointers, so destruction must be virtual.
+  virtual ~data_chunk_reader() = default;
+
+  /**
+   * @brief Get the next chunk of bytes from the data source
+   *
+   * Performs any necessary work to read and prepare the underlying data source for consumption as a
+   * view over device memory. Common implementations may read from a file, copy data from host
+   * memory, allocate temporary memory, perform iterative decompression, or launch device kernels.
+   *
+   * @param size number of bytes to read.
+   * @param stream stream to associate allocations or perform work required to obtain chunk
+   * @return a chunk of data up to @p size bytes. May return less than @p size bytes if reader
+   * reaches end of underlying data source. Data must be accessed in stream order on @p stream.
+   */
+  virtual device_span<char const> get_next_chunk(std::size_t size,
+                                                 rmm::cuda_stream_view stream) = 0;
+};
+
+/**
+ * @brief a data source capable of creating a reader which can produce views of the data source in
+ * device memory. Destroyed through base pointers, hence the virtual destructor.
+ */
+class data_chunk_source {
+ public:
+  virtual ~data_chunk_source() = default;
+  virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
+};
+
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
new file mode 100644
index 00000000000..f6807c1c9a8
--- /dev/null
+++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/io/text/data_chunk_source.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <rmm/device_buffer.hpp>
+
+#include <thrust/host_vector.h>
+#include <thrust/system/cuda/experimental/pinned_allocator.h>
+
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+namespace cudf {
+namespace io {
+namespace text {
+
+namespace {
+
+/**
+ * @brief a reader which produces views of device memory which contain a copy of the data from an
+ * istream.
+ *
+ */
+class istream_data_chunk_reader : public data_chunk_reader {
+  struct host_ticket {
+    cudaEvent_t event;  // recorded after each host-to-device copy that reads from `buffer`
+    thrust::host_vector<char, thrust::system::cuda::experimental::pinned_allocator<char>> buffer;
+  };
+
+ public:
+  istream_data_chunk_reader(std::unique_ptr<std::istream> datastream)
+    : _datastream(std::move(datastream)), _buffers(), _tickets(2)
+  {
+    // create one event per ticket to track the completion of its last host-to-device copy.
+    for (std::size_t i = 0; i < _tickets.size(); i++) {
+      CUDA_TRY(cudaEventCreate(&(_tickets[i].event)));
+    }
+  }
+
+  ~istream_data_chunk_reader()
+  {
+    for (std::size_t i = 0; i < _tickets.size(); i++) {
+      CUDA_TRY(cudaEventDestroy(_tickets[i].event));
+    }
+  }
+
+  device_span<char> find_or_create_data(std::size_t size, rmm::cuda_stream_view stream)
+  {
+    auto search = _buffers.find(stream.value());
+
+    if (search == _buffers.end() || search->second.size() < size) {  // missing or too small
+      _buffers[stream.value()] = rmm::device_buffer(size, stream);
+    }
+
+    return device_span<char>(static_cast<char*>(_buffers[stream.value()].data()), size);
+  }
+
+  device_span<char const> get_next_chunk(std::size_t read_size,
+                                         rmm::cuda_stream_view stream) override
+  {
+    CUDF_FUNC_RANGE();
+
+    auto& h_ticket = _tickets[_next_ticket_idx];  // tickets are used in round-robin order
+
+    _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size();
+
+    // wait for this ticket's previous host-to-device copy, so we don't clobber its host buffer.
+    CUDA_TRY(cudaEventSynchronize(h_ticket.event));
+
+    // resize the host buffer as necessary to contain the requested number of bytes
+    if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); }
+
+    // read data from the host istream in to the pinned host memory buffer
+    _datastream->read(h_ticket.buffer.data(), read_size);
+
+    // adjust the read size to reflect how many bytes were actually read from the data stream
+    read_size = _datastream->gcount();
+
+    // get a view over some device memory we can use to buffer the read data on to device.
+    auto chunk_span = find_or_create_data(read_size, stream);
+
+    // copy the host-pinned data on to device
+    CUDA_TRY(cudaMemcpyAsync(  //
+      chunk_span.data(),
+      h_ticket.buffer.data(),
+      read_size,
+      cudaMemcpyHostToDevice,
+      stream.value()));
+
+    // record the host-to-device copy so the next use of this ticket can wait for it.
+    CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
+
+    // return the view over device memory so it can be processed.
+    return chunk_span;
+  }
+
+ private:
+  std::size_t _next_ticket_idx = 0;
+  std::unique_ptr<std::istream> _datastream;
+  std::unordered_map<cudaStream_t, rmm::device_buffer> _buffers;  // one device buffer per stream
+  std::vector<host_ticket> _tickets;
+};
+
+/**
+ * @brief a reader which produces view of device memory which represent a subset of the input device
+ * span
+ *
+ */
+class device_span_data_chunk_reader : public data_chunk_reader {
+ public:
+  device_span_data_chunk_reader(device_span<char const> data) : _data(data) {}
+
+  device_span<char const> get_next_chunk(std::size_t read_size,
+                                         rmm::cuda_stream_view stream) override
+  {
+    // never hand out more bytes than remain between the cursor and the end of the span.
+    auto const bytes_left = _data.size() - _position;
+    auto const chunk_size = (read_size > bytes_left) ? bytes_left : read_size;
+
+    // remember where this chunk starts before the cursor moves.
+    auto const chunk_begin = _position;
+
+    // advance the cursor past the bytes being returned.
+    _position += chunk_size;
+
+    // return the view over device memory so it can be processed.
+    return _data.subspan(chunk_begin, chunk_size);
+  }
+
+ private:
+  device_span<char const> _data;
+  uint64_t _position = 0;
+};
+
+/**
+ * @brief a file data source which creates an istream_data_chunk_reader
+ * Every reader opens its own std::ifstream on the stored file name.
+ */
+class file_data_chunk_source : public data_chunk_source {
+ public:
+  file_data_chunk_source(std::string filename) : _filename(std::move(filename)) {}
+  std::unique_ptr<data_chunk_reader> create_reader() const override
+  {
+    return std::make_unique<istream_data_chunk_reader>(
+      std::make_unique<std::ifstream>(_filename, std::ifstream::in));
+  }
+
+ private:
+  std::string _filename;
+};
+
+/**
+ * @brief a host string data source which creates an istream_data_chunk_reader
+ */
+class string_data_chunk_source : public data_chunk_source {
+ public:
+  string_data_chunk_source(std::string const& data) : _data(data) {}
+  std::unique_ptr<data_chunk_reader> create_reader() const override
+  {
+    return std::make_unique<istream_data_chunk_reader>(std::make_unique<std::istringstream>(_data));
+  }
+
+ private:
+  std::string _data;  // owned copy: stays valid even if the caller passed a temporary
+};
+
+/**
+ * @brief a device span data source which creates a device_span_data_chunk_reader
+ */
+class device_span_data_chunk_source : public data_chunk_source {
+ public:
+  device_span_data_chunk_source(device_span<char const> data) : _data(data) {}
+  std::unique_ptr<data_chunk_reader> create_reader() const override
+  {
+    return std::make_unique<device_span_data_chunk_reader>(_data);
+  }
+
+ private:
+  device_span<char const> _data;
+};
+
+}  // namespace
+
+/**
+ * @brief Creates a data source capable of producing device-buffered views of the given string.
+ */
+inline std::unique_ptr<data_chunk_source> make_source(std::string const& data)
+{
+  return std::make_unique<string_data_chunk_source>(data);
+}
+
+/**
+ * @brief Creates a data source capable of producing device-buffered views of the file
+ */
+inline std::unique_ptr<data_chunk_source> make_source_from_file(std::string const& filename)
+{
+  return std::make_unique<file_data_chunk_source>(filename);
+}
+
+/**
+ * @brief Creates a data source capable of producing views of the given device string scalar
+ */
+inline std::unique_ptr<data_chunk_source> make_source(cudf::string_scalar& data)
+{
+  auto data_span = device_span<char const>(data.data(), data.size());
+  return std::make_unique<device_span_data_chunk_source>(data_span);
+}
+
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp
new file mode 100644
index 00000000000..d3c8909ab51
--- /dev/null
+++ b/cpp/include/cudf/io/text/detail/multistate.hpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace cudf {
+namespace io {
+namespace text {
+namespace detail {
+
+/**
+ * @brief Represents up to 7 segments
+ */
+struct multistate {
+ private:
+  /**
+   * @brief represents a (head, tail] segment, stored as a single 8 bit value
+   */
+  struct multistate_segment {
+   public:
+    /**
+     * @brief Creates a segment which represents (0, 0]
+     */
+    constexpr multistate_segment() : _data(0) {}
+
+    /**
+     * @brief Creates a segment which represents (head, tail]
+     * The head is packed into the low 4 bits of the byte and the tail into the high 4 bits.
+     *
+     * @param head the (head, ____] value. Undefined behavior for values >= 16
+     * @param tail the (____, tail] value. Undefined behavior for values >= 16
+     */
+    constexpr multistate_segment(uint8_t head, uint8_t tail) : _data((head & 0b1111) | (tail << 4))
+    {
+    }
+
+    /**
+     * @brief Gets the (head, ____] value from the segment.
+     */
+    constexpr uint8_t get_head() const { return _data & 0b1111; }
+
+    /**
+     * @brief Gets the (____, tail] value from the segment.
+     */
+    constexpr uint8_t get_tail() const { return _data >> 4; }
+
+   private:
+    uint8_t _data;
+  };
+
+ public:
+  /**
+   * @brief The maximum state (head or tail) this multistate can represent
+   */
+  static auto constexpr max_segment_value = 15;
+
+  /**
+   * @brief The maximum number of segments this multistate can represent
+   */
+  static auto constexpr max_segment_count = 7;
+
+  /**
+   * @brief Enqueues a (head, tail] segment to this multistate
+   *
+   * @note: The behavior of this function is undefined if size() >= max_segment_count
+   */
+  constexpr void enqueue(uint8_t head, uint8_t tail)
+  {
+    _segments[_size++] = multistate_segment(head, tail);
+  }
+
+  /**
+   * @brief gets the number of segments this multistate represents
+   */
+  constexpr uint8_t size() const { return _size; }
+
+  /**
+   * @brief gets the highest (____, tail] value this multistate represents
+   */
+  constexpr uint8_t max_tail() const
+  {
+    uint8_t maximum = 0;
+    // compared by hand: this header does not include <algorithm>, so std::max is unavailable.
+    for (uint8_t i = 0; i < _size; i++) {
+      if (get_tail(i) > maximum) { maximum = get_tail(i); }
+    }
+
+    return maximum;
+  }
+
+  /**
+   * @brief gets the Nth (head, ____] value state this multistate represents
+   */
+  constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); }
+
+  /**
+   * @brief gets the Nth (____, tail] value state this multistate represents
+   */
+  constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); }
+
+ private:
+  uint8_t _size = 0;
+  multistate_segment _segments[max_segment_count];
+};
+
+/**
+ * @brief associatively inner-joins transition histories.
+ *
+ * Examples:
+ *           <(0, 5]> + <(5, 9]>         = <(0, 9]>
+ *           <(0, 5]> + <(6, 9]>         = <>
+ *   <(0, 1], (0, 2]> + <(2, 3], (1, 4]> = <(0, 4], (0, 3]>
+ *   <(0, 1], (0, 2]> + <(1, 3]>         = <(0, 3]>
+ *
+ * Head and tail values are limited to [0, 1, ..., 15]
+ *
+ * @param lhs past segments
+ * @param rhs future segments
+ * @return inner join of past and future segments
+ */
+constexpr multistate operator+(multistate const& lhs, multistate const& rhs)
+{
+  // combine two multistates together by inner-joining LHS tails to RHS heads,
+  // and taking the corresponding LHS heads and RHS tails.
+
+  multistate result;
+  for (uint8_t lhs_idx = 0; lhs_idx < lhs.size(); lhs_idx++) {
+    auto tail = lhs.get_tail(lhs_idx);
+    for (uint8_t rhs_idx = 0; rhs_idx < rhs.size(); rhs_idx++) {
+      auto head = rhs.get_head(rhs_idx);
+      if (tail == head) { result.enqueue(lhs.get_head(lhs_idx), rhs.get_tail(rhs_idx)); }
+    }
+  }
+  return result;
+}
+
+}  // namespace detail
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp
new file mode 100644
index 00000000000..849d857597b
--- /dev/null
+++ b/cpp/include/cudf/io/text/detail/tile_state.hpp
@@ -0,0 +1,134 @@
+
+#pragma once
+
+#include <cub/block/block_scan.cuh>
+
+#include <cuda/atomic>
+
+namespace cudf {
+namespace io {
+namespace text {
+namespace detail {
+
+enum class scan_tile_status : uint8_t {
+  oob,        // NOTE(review): presumably "out of bounds"; not referenced in this header
+  invalid,    // get_prefix() spins while a tile reports this status
+  partial,    // stored by set_partial_prefix()
+  inclusive,  // stored by set_inclusive_prefix()
+};
+
+template <typename T>
+struct scan_tile_state_view {
+  uint64_t num_tiles;  // used as the modulus when turning a tile index into an array offset
+  cuda::atomic<scan_tile_status, cuda::thread_scope_device>* tile_status;
+  T* tile_partial;
+  T* tile_inclusive;
+
+  __device__ inline void set_status(cudf::size_type tile_idx, scan_tile_status status)
+  {
+    auto const offset = (tile_idx + num_tiles) % num_tiles;  // maps tile_idx onto [0, num_tiles)
+    tile_status[offset].store(status, cuda::memory_order_relaxed);
+  }
+
+  __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value)
+  {
+    auto const offset = (tile_idx + num_tiles) % num_tiles;
+    cub::ThreadStore<cub::STORE_CG>(tile_partial + offset, value);  // value first, status second
+    tile_status[offset].store(scan_tile_status::partial);
+  }
+
+  __device__ inline void set_inclusive_prefix(cudf::size_type tile_idx, T value)
+  {
+    auto const offset = (tile_idx + num_tiles) % num_tiles;
+    cub::ThreadStore<cub::STORE_CG>(tile_inclusive + offset, value);  // value first, status second
+    tile_status[offset].store(scan_tile_status::inclusive);
+  }
+
+  __device__ inline T get_prefix(cudf::size_type tile_idx, scan_tile_status& status)
+  {
+    auto const offset = (tile_idx + num_tiles) % num_tiles;
+
+    while ((status = tile_status[offset].load(cuda::memory_order_relaxed)) ==
+           scan_tile_status::invalid) {}  // busy-wait until the status is no longer `invalid`
+
+    if (status == scan_tile_status::partial) {
+      return cub::ThreadLoad<cub::LOAD_CG>(tile_partial + offset);
+    } else {  // every other non-invalid status reads the inclusive value
+      return cub::ThreadLoad<cub::LOAD_CG>(tile_inclusive + offset);
+    }
+  }
+};
+
+template <typename T>
+struct scan_tile_state {
+  rmm::device_uvector<cuda::atomic<scan_tile_status, cuda::thread_scope_device>> tile_status;
+  rmm::device_uvector<T> tile_state_partial;
+  rmm::device_uvector<T> tile_state_inclusive;
+
+  scan_tile_state(cudf::size_type num_tiles,
+                  rmm::cuda_stream_view stream,
+                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+    : tile_status(rmm::device_uvector<cuda::atomic<scan_tile_status, cuda::thread_scope_device>>(
+        num_tiles, stream, mr)),
+      tile_state_partial(rmm::device_uvector<T>(num_tiles, stream, mr)),
+      tile_state_inclusive(rmm::device_uvector<T>(num_tiles, stream, mr))
+  {
+  }  // all three vectors are created with num_tiles elements
+
+  operator scan_tile_state_view<T>()  // non-owning view over the three vectors
+  {
+    return scan_tile_state_view<T>{tile_status.size(),
+                                   tile_status.data(),
+                                   tile_state_partial.data(),
+                                   tile_state_inclusive.data()};
+  }
+
+  inline T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) const
+  {
+    auto const offset = (tile_idx + tile_status.size()) % tile_status.size();  // same wrap as view
+    return tile_state_inclusive.element(offset, stream);
+  }
+};
+
+template <typename T>
+struct scan_tile_state_callback {
+  __device__ inline scan_tile_state_callback(scan_tile_state_view<T>& tile_state,
+                                             cudf::size_type tile_idx)
+    : _tile_state(tile_state), _tile_idx(tile_idx)
+  {
+  }
+
+  __device__ inline T operator()(T const& block_aggregate)
+  {
+    T exclusive_prefix;  // only assigned by the thread with threadIdx.x == 0
+
+    if (threadIdx.x == 0) {
+      _tile_state.set_partial_prefix(_tile_idx, block_aggregate);
+
+      auto predecessor_idx    = _tile_idx - 1;
+      auto predecessor_status = scan_tile_status::invalid;
+
+      // accumulate predecessor prefixes, walking backwards until an inclusive one is found
+
+      auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status);
+      while (predecessor_status != scan_tile_status::inclusive) {
+        predecessor_idx--;  // step one tile further back
+        auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status);
+        window_partial          = predecessor_prefix + window_partial;
+      }
+      exclusive_prefix = window_partial;
+
+      _tile_state.set_inclusive_prefix(_tile_idx, exclusive_prefix + block_aggregate);
+    }
+
+    return exclusive_prefix;
+  }
+
+  scan_tile_state_view<T>& _tile_state;
+  cudf::size_type _tile_idx;
+};
+
+}  // namespace detail
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp
new file mode 100644
index 00000000000..d14fe15b0a9
--- /dev/null
+++ b/cpp/include/cudf/io/text/detail/trie.hpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/text/detail/multistate.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <algorithm>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace cudf {
+namespace io {
+namespace text {
+namespace detail {
+
+struct trie_node {
+  char token;            // character compared against the input in transition_enqueue_all
+  uint8_t match_length;  // nonzero marks a matching state (see trie_device_view::is_match)
+  uint8_t child_begin;   // index of the first child; children end at the next node's child_begin
+};
+
+struct trie_device_view {
+  device_span<trie_node const> _nodes;
+
+  /**
+   * @brief create a multistate which contains all partial path matches for the given token.
+   */
+  constexpr multistate transition_init(char c)
+  {
+    auto result = multistate();
+
+    result.enqueue(0, 0);
+
+    for (uint8_t curr = 0; curr < _nodes.size() - 1; curr++) {
+      transition_enqueue_all(c, result, curr, curr);
+    }
+    return result;
+  }
+
+  /**
+   * @brief create a new multistate by transitioning all states in the multistate by the given token
+   *
+   * Eliminates any partial matches that cannot transition using the given token.
+   *
+   * @note always enqueues (0, 0] as the first state of the returned multistate.
+   */
+  constexpr multistate transition(char c, multistate const& states)
+  {
+    auto result = multistate();
+
+    result.enqueue(0, 0);
+
+    for (uint8_t i = 0; i < states.size(); i++) {
+      transition_enqueue_all(c, result, states.get_head(i), states.get_tail(i));
+    }
+
+    return result;
+  }
+
+  /**
+   * @brief returns true if the given index is associated with a matching state.
+   */
+  constexpr bool is_match(uint16_t idx) { return static_cast<bool>(get_match_length(idx)); }
+
+  /**
+   * @brief returns the match length if the given index is associated with a matching state,
+   * otherwise zero.
+   */
+  constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; }
+
+  /**
+   * @brief returns the longest match length of any state in the multistate.
+   */
+  template <uint32_t N>
+  constexpr uint8_t get_match_length(multistate const& states)
+  {
+    uint8_t val = 0;  // unsigned like match_length, so lengths above 127 compare correctly
+    for (uint8_t i = 0; i < states.size(); i++) {
+      auto match_length = get_match_length(states.get_tail(i));
+      if (match_length > val) { val = match_length; }
+    }
+    return val;
+  }
+
+ private:
+  constexpr void transition_enqueue_all(  //
+    char c,
+    multistate& states,
+    uint8_t head,
+    uint8_t curr)
+  {
+    for (uint32_t tail = _nodes[curr].child_begin; tail < _nodes[curr + 1].child_begin; tail++) {
+      if (_nodes[tail].token == c) {  //
+        states.enqueue(head, tail);
+      }
+    }
+  }
+};
+
+/**
+ * @brief A flat trie contained in device memory.
+ */
+struct trie {
+ private:
+  cudf::size_type _max_duplicate_tokens;
+  rmm::device_uvector<trie_node> _nodes;
+
+  trie(cudf::size_type max_duplicate_tokens, rmm::device_uvector<trie_node>&& nodes)
+    : _max_duplicate_tokens(max_duplicate_tokens), _nodes(std::move(nodes))
+  {
+  }
+
+  /**
+   * @brief Used to build a hierarchical trie which can then be flattened.
+   */
+  struct trie_builder_node {
+    uint8_t match_length;
+    std::unordered_map<char, std::unique_ptr<trie_builder_node>> children;
+
+    /**
+     * @brief Insert the string in to the trie tree, growing the trie as necessary
+     */
+    void insert(std::string s) { insert(s.c_str(), s.size(), 0); }
+
+   private:
+    trie_builder_node& insert(char const* s, uint16_t size, uint8_t depth)
+    {
+      if (size == 0) {
+        match_length = depth;  // the node ending a pattern records that pattern's length
+        return *this;
+      }
+
+      if (children[*s] == nullptr) { children[*s] = std::make_unique<trie_builder_node>(); }
+
+      return children[*s]->insert(s + 1, size - 1, depth + 1);
+    }
+  };
+
+ public:
+  /**
+   * @brief Gets the number of nodes contained in this trie.
+   */
+  cudf::size_type size() const { return _nodes.size(); }
+
+  /**
+   * @brief A pessimistic count of duplicate tokens in the trie. Used to determine the maximum
+   * possible stack size required to compute matches of this trie in parallel.
+   */
+  cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; }
+
+  /**
+   * @brief Create a trie which represents the given pattern.
+   *
+   * @param pattern The pattern to store in the trie
+   * @param stream The stream to use for allocation and copy
+   * @param mr Memory resource to use for the device memory allocation
+   * @return The trie.
+   */
+  static trie create(std::string const& pattern,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+
+  {
+    return create(std::vector<std::string>{pattern}, stream, mr);
+  }
+
+  /**
+   * @brief Create a trie which represents the given patterns.
+   *
+   * @param patterns The patterns to store in the trie
+   * @param stream The stream to use for allocation and copy
+   * @param mr Memory resource to use for the device memory allocation
+   * @return The trie.
+   */
+  static trie create(std::vector<std::string> const& patterns,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  {
+    std::vector<char> tokens;
+    std::vector<uint8_t> transitions;
+    std::vector<uint8_t> match_length;
+
+    // create the trie tree
+    auto root = std::make_unique<trie_builder_node>();
+    for (auto& pattern : patterns) {
+      root->insert(pattern);
+    }
+
+    // flatten the tree breadth-first; `sum` is the running index of the next node's first child
+    auto sum = 1;
+    transitions.emplace_back(sum);
+    match_length.emplace_back(root->match_length);
+
+    auto builder_nodes = std::queue<std::unique_ptr<trie_builder_node>>();
+    builder_nodes.push(std::move(root));
+
+    tokens.emplace_back(0);  // placeholder token for the root node
+
+    while (builder_nodes.size()) {
+      auto layer_size = builder_nodes.size();
+      for (uint32_t i = 0; i < layer_size; i++) {
+        auto node = std::move(builder_nodes.front());
+        builder_nodes.pop();
+        sum += node->children.size();
+        transitions.emplace_back(sum);
+        for (auto& item : node->children) {
+          match_length.emplace_back(item.second->match_length);
+          tokens.emplace_back(item.first);
+          builder_nodes.push(std::move(item.second));
+        }
+      }
+    }
+
+    tokens.emplace_back(0);  // extra trailing entry so tokens matches transitions in length
+
+    match_length.emplace_back(0);  // likewise for match_length
+
+    std::vector<trie_node> trie_nodes;
+    auto token_counts = std::unordered_map<cudf::size_type, int32_t>();
+
+    for (uint32_t i = 0; i < tokens.size(); i++) {
+      trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]});
+      token_counts[tokens[i]]++;
+    }
+
+    auto most_common_token =
+      std::max_element(token_counts.begin(), token_counts.end(), [](auto const& a, auto const& b) {
+        return a.second < b.second;
+      });
+
+    auto max_duplicate_tokens = most_common_token->second;
+
+    return trie{max_duplicate_tokens,
+                cudf::detail::make_device_uvector_sync(trie_nodes, stream, mr)};
+  }
+
+  trie_device_view view() const { return trie_device_view{_nodes}; }
+};
+
+}  // namespace detail
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
new file mode 100644
index 00000000000..d42ee9f510e
--- /dev/null
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/io/text/data_chunk_source.hpp>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <memory>
+
+namespace cudf {
+namespace io {
+namespace text {
+
+std::unique_ptr<cudf::column> multibyte_split(
+  data_chunk_source const& source,
+  std::string const& delimiter,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
new file mode 100644
index 00000000000..662ec744680
--- /dev/null
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/io/text/data_chunk_source.hpp>
+#include <cudf/io/text/detail/multistate.hpp>
+#include <cudf/io/text/detail/tile_state.hpp>
+#include <cudf/io/text/detail/trie.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_scan.cuh>
+
+#include <memory>
+
+namespace {
+
+using cudf::io::text::detail::multistate;
+
+int32_t constexpr ITEMS_PER_THREAD = 32;   // input bytes processed by each thread
+int32_t constexpr THREADS_PER_TILE = 128;  // threads per block; one block handles one tile
+int32_t constexpr ITEMS_PER_TILE   = ITEMS_PER_THREAD * THREADS_PER_TILE;  // bytes per tile
+int32_t constexpr TILES_PER_CHUNK  = 1024;  // max tiles (blocks) launched per chunk read
+int32_t constexpr ITEMS_PER_CHUNK  = ITEMS_PER_TILE * TILES_PER_CHUNK;  // bytes requested per read
+
+struct PatternScan {
+  using BlockScan         = cub::BlockScan<multistate, THREADS_PER_TILE>;
+  using BlockScanCallback = cudf::io::text::detail::scan_tile_state_callback<multistate>;
+
+  struct _TempStorage {
+    typename BlockScan::TempStorage scan;
+  };
+
+  _TempStorage& _temp_storage;  // alias of the storage passed to the constructor
+
+  using TempStorage = cub::Uninitialized<_TempStorage>;
+
+  __device__ inline PatternScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) {}
+  // Fills thread_state[i] with max_tail() of the multistate reached after reading thread_data[i].
+  __device__ inline void Scan(cudf::size_type tile_idx,
+                              cudf::io::text::detail::scan_tile_state_view<multistate> tile_state,
+                              cudf::io::text::detail::trie_device_view trie,
+                              char (&thread_data)[ITEMS_PER_THREAD],
+                              uint32_t (&thread_state)[ITEMS_PER_THREAD])
+  {
+    auto thread_multistate = trie.transition_init(thread_data[0]);  // fold this thread's bytes
+
+    for (uint32_t i = 1; i < ITEMS_PER_THREAD; i++) {
+      thread_multistate = trie.transition(thread_data[i], thread_multistate);
+    }
+    // Combine with the other threads (and, via the callback, presumably with earlier tiles).
+    auto prefix_callback = BlockScanCallback(tile_state, tile_idx);
+
+    BlockScan(_temp_storage.scan)
+      .ExclusiveSum(thread_multistate, thread_multistate, prefix_callback);
+    // Replay the same bytes from the scanned start state to get the state after each byte.
+    for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) {
+      thread_multistate = trie.transition(thread_data[i], thread_multistate);
+
+      thread_state[i] = thread_multistate.max_tail();
+    }
+  }
+};
+
+// multibyte_split works by splitting up inputs into 32 inputs (bytes) per thread, and transforming
+// them into data structures called "multistates". These multistates are created by searching a
+// trie, but instead of a traditional trie where the search begins at a single node at the
+// beginning, we allow our search to begin anywhere within the trie. The position within the trie is
+// stored as a "partial match path", which indicates "we can get from here to there by a set of
+// specific transitions". By scanning together multistates, we effectively know "we can get here
+// from the beginning by following the inputs". By doing this, each thread knows exactly what state
+// it begins in. From there, each thread can then take deterministic action. In this case, the
+// deterministic action is counting and outputting delimiter offsets when a delimiter is found.
+
+__global__ void multibyte_split_init_kernel(  // marks a range of tile-state slots with `status`
+  cudf::size_type base_tile_idx,  // index of the first tile to mark
+  cudf::size_type num_tiles,      // number of consecutive tiles to mark
+  cudf::io::text::detail::scan_tile_state_view<multistate> tile_multistates,
+  cudf::io::text::detail::scan_tile_state_view<uint32_t> tile_output_offsets,
+  cudf::io::text::detail::scan_tile_status status =
+    cudf::io::text::detail::scan_tile_status::invalid)
+{
+  auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per tile
+  if (thread_idx < num_tiles) {  // threads past the requested range do nothing
+    auto const tile_idx = base_tile_idx + thread_idx;
+    tile_multistates.set_status(tile_idx, status);
+    tile_output_offsets.set_status(tile_idx, status);
+  }
+}
+
+__global__ void multibyte_split_seed_kernel(  // stores the starting prefixes at tile index -1
+  cudf::io::text::detail::scan_tile_state_view<multistate> tile_multistates,
+  cudf::io::text::detail::scan_tile_state_view<uint32_t> tile_output_offsets,
+  multistate tile_multistate_seed,  // inclusive prefix stored for the multistate scan
+  uint32_t tile_output_offset)      // inclusive prefix stored for the output-offset scan
+{
+  auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (thread_idx == 0) {  // a single thread performs both writes
+    tile_multistates.set_inclusive_prefix(-1, tile_multistate_seed);
+    tile_output_offsets.set_inclusive_prefix(-1, tile_output_offset);
+  }
+}
+
+__global__ void multibyte_split_kernel(  // each thread block scans one tile of the chunk
+  cudf::size_type base_tile_idx,  // tile index assigned to block 0 of this launch
+  cudf::io::text::detail::scan_tile_state_view<multistate> tile_multistates,
+  cudf::io::text::detail::scan_tile_state_view<uint32_t> tile_output_offsets,
+  cudf::io::text::detail::trie_device_view trie,
+  int32_t chunk_input_offset,  // added to chunk-relative positions when writing chars
+  cudf::device_span<char const> chunk_input_chars,
+  cudf::device_span<int32_t> abs_output_delimiter_offsets,  // empty span: offsets not written
+  cudf::device_span<char> abs_output_chars)                 // empty span: chars not written
+{
+  using InputLoad =
+    cub::BlockLoad<char, THREADS_PER_TILE, ITEMS_PER_THREAD, cub::BLOCK_LOAD_VECTORIZE>;
+  using OffsetScan         = cub::BlockScan<uint32_t, THREADS_PER_TILE>;
+  using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback<uint32_t>;
+
+  __shared__ union {  // the load and the two scans reuse the same shared memory in turn
+    typename InputLoad::TempStorage input_load;
+    typename PatternScan::TempStorage pattern_scan;
+    typename OffsetScan::TempStorage offset_scan;
+  } temp_storage;
+  // The two *_input_offset values are relative to the start of the chunk, not the whole input.
+  int32_t const tile_idx            = base_tile_idx + blockIdx.x;
+  int32_t const tile_input_offset   = blockIdx.x * ITEMS_PER_TILE;
+  int32_t const thread_input_offset = tile_input_offset + threadIdx.x * ITEMS_PER_THREAD;
+  int32_t const thread_input_size   = chunk_input_chars.size() - thread_input_offset;
+  // thread_input_size: bytes from this thread's first byte to the end of the chunk (may be <= 0).
+  // STEP 1: Load inputs
+
+  char thread_chars[ITEMS_PER_THREAD];
+
+  InputLoad(temp_storage.input_load)
+    .Load(chunk_input_chars.data() + tile_input_offset,
+          thread_chars,
+          chunk_input_chars.size() - tile_input_offset);  // bytes left in the chunk from this tile
+
+  // STEP 2: Scan inputs to determine absolute thread states
+
+  uint32_t thread_states[ITEMS_PER_THREAD];
+
+  __syncthreads();  // required before temp_memory re-use
+  PatternScan(temp_storage.pattern_scan)
+    .Scan(tile_idx, tile_multistates, trie, thread_chars, thread_states);
+
+  // STEP 3: Flag matches
+
+  uint32_t thread_offsets[ITEMS_PER_THREAD];
+  // 1 when byte i lies inside the chunk and its state is a match, otherwise 0.
+  for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) {
+    thread_offsets[i] = i < thread_input_size and trie.is_match(thread_states[i]);
+  }
+
+  // STEP 4: Scan flags to determine absolute thread output offset
+
+  auto prefix_callback = OffsetScanCallback(tile_output_offsets, tile_idx);
+
+  __syncthreads();  // required before temp_memory re-use
+  OffsetScan(temp_storage.offset_scan)
+    .ExclusiveSum(thread_offsets, thread_offsets, prefix_callback);
+
+  // Step 5: Assign outputs from each thread using match offsets.
+
+  if (abs_output_chars.size() > 0) {  // copy this thread's in-range bytes to the output chars
+    for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) {
+      abs_output_chars[chunk_input_offset + thread_input_offset + i] = thread_chars[i];
+    }
+  }
+
+  if (abs_output_delimiter_offsets.size() > 0) {
+    for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) {
+      if (trie.is_match(thread_states[i])) {  // thread_offsets[i] is this match's output slot
+        auto const match_end = base_tile_idx * ITEMS_PER_TILE + thread_input_offset + i + 1;
+        abs_output_delimiter_offsets[thread_offsets[i]] = match_end;
+      }
+    }
+  }
+}
+
+}  // namespace
+
+namespace cudf {
+namespace io {
+namespace text {
+namespace detail {
+
+void fork_stream(std::vector<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
+{  // Makes every stream in `streams` wait for the work already enqueued on `stream`.
+  cudaEvent_t event;
+  CUDA_TRY(cudaEventCreate(&event));  // CUDA_TRY: surface runtime errors instead of dropping them
+  CUDA_TRY(cudaEventRecord(event, stream));  // marks the point the forked streams wait on
+  for (auto const& forked_stream : streams) {
+    CUDA_TRY(cudaStreamWaitEvent(forked_stream, event, 0));
+  }
+  CUDA_TRY(cudaEventDestroy(event));  // the waits enqueued above are unaffected by the destroy
+}
+
+void join_stream(std::vector<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
+{  // Makes `stream` wait for the work already enqueued on every stream in `streams`.
+  cudaEvent_t event;
+  CUDA_TRY(cudaEventCreate(&event));  // CUDA_TRY: surface runtime errors instead of dropping them
+  for (auto const& joined_stream : streams) {  // a single event is re-recorded for each stream
+    CUDA_TRY(cudaEventRecord(event, joined_stream));
+    CUDA_TRY(cudaStreamWaitEvent(stream, event, 0));  // binds to the record made just above
+  }
+  CUDA_TRY(cudaEventDestroy(event));
+}
+
+std::vector<rmm::cuda_stream_view> get_streams(int32_t count, rmm::cuda_stream_pool& stream_pool)
+{  // Collects `count` stream views from `stream_pool`, one get_stream() call per view.
+  std::vector<rmm::cuda_stream_view> pooled_streams;
+  for (int32_t remaining = count; remaining > 0; --remaining) {
+    pooled_streams.emplace_back(stream_pool.get_stream());
+  }
+  return pooled_streams;
+}
+
+cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source,
+                                                 cudf::io::text::detail::trie const& trie,
+                                                 scan_tile_state<multistate>& tile_multistates,
+                                                 scan_tile_state<uint32_t>& tile_offsets,
+                                                 device_span<cudf::size_type> output_buffer,
+                                                 device_span<char> output_char_buffer,
+                                                 rmm::cuda_stream_view stream,
+                                                 std::vector<rmm::cuda_stream_view> const& streams)
+{  // Feeds every chunk of `source` through the split kernel; returns the bytes consumed.
+  CUDF_FUNC_RANGE();
+  cudf::size_type chunk_offset = 0;  // bytes consumed so far
+  // Mark the TILES_PER_CHUNK tile slots preceding tile 0 as out-of-bounds (`oob`).
+  multibyte_split_init_kernel<<<TILES_PER_CHUNK, THREADS_PER_TILE, 0, stream.value()>>>(  //
+    -TILES_PER_CHUNK,
+    TILES_PER_CHUNK,
+    tile_multistates,
+    tile_offsets,
+    cudf::io::text::detail::scan_tile_status::oob);
+
+  auto multistate_seed = multistate();
+  multistate_seed.enqueue(0, 0);  // this represents the first state in the pattern.
+
+  // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as
+  // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block
+  // would have to follow separate logic.
+  multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>(  //
+    tile_multistates,
+    tile_offsets,
+    multistate_seed,
+    0);
+
+  fork_stream(streams, stream);  // the per-chunk streams must observe the init and seed above
+
+  auto reader = source.create_reader();
+  // Event used to order each chunk's kernel after the previous chunk's kernel.
+  cudaEvent_t last_launch_event;
+  CUDA_TRY(cudaEventCreate(&last_launch_event));
+
+  for (int32_t i = 0; true; i++) {
+    auto base_tile_idx = i * TILES_PER_CHUNK;
+    auto chunk_stream  = streams[i % streams.size()];  // rotate through the given streams
+    auto chunk         = reader->get_next_chunk(ITEMS_PER_CHUNK, chunk_stream);
+
+    if (chunk.size() == 0) { break; }
+
+    auto tiles_in_launch =
+      cudf::util::div_rounding_up_safe(chunk.size(), static_cast<std::size_t>(ITEMS_PER_TILE));
+
+    // reset the next chunk of tile state
+    multibyte_split_init_kernel<<<tiles_in_launch, THREADS_PER_TILE, 0, chunk_stream>>>(  //
+      base_tile_idx,
+      tiles_in_launch,
+      tile_multistates,
+      tile_offsets);
+    // Wait for the previous chunk's kernel; with nothing recorded yet this is a no-op.
+    CUDA_TRY(cudaStreamWaitEvent(chunk_stream, last_launch_event, 0));
+
+    multibyte_split_kernel<<<tiles_in_launch, THREADS_PER_TILE, 0, chunk_stream>>>(  //
+      base_tile_idx,
+      tile_multistates,
+      tile_offsets,
+      trie.view(),
+      chunk_offset,
+      chunk,
+      output_buffer,
+      output_char_buffer);
+
+    CUDA_TRY(cudaEventRecord(last_launch_event, chunk_stream));
+
+    chunk_offset += chunk.size();
+  }
+
+  CUDA_TRY(cudaEventDestroy(last_launch_event));
+
+  join_stream(streams, stream);  // `stream` now waits for all per-chunk work
+
+  return chunk_offset;
+}
+
+std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
+                                              std::string const& delimiter,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr,
+                                              rmm::cuda_stream_pool& stream_pool)
+{  // Builds the strings column in two full passes over `source`: count first, then write.
+  CUDF_FUNC_RANGE();
+  auto const trie = cudf::io::text::detail::trie::create({delimiter}, stream);
+
+  CUDF_EXPECTS(trie.max_duplicate_tokens() < multistate::max_segment_count,
+               "delimiter contains too many duplicate tokens to produce a deterministic result.");
+
+  CUDF_EXPECTS(trie.size() < multistate::max_segment_value,
+               "delimiter contains too many total tokens to produce a deterministic result.");
+
+  auto concurrency = 2;  // number of streams taken from `stream_pool`
+  // must be at least 32 when using warp-reduce on partials
+  // must be at least 1 more than max possible concurrent tiles
+  // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s
+  auto num_tile_states  = std::max(32, TILES_PER_CHUNK * concurrency + 32);
+  auto tile_multistates = scan_tile_state<multistate>(num_tile_states, stream);
+  auto tile_offsets     = scan_tile_state<uint32_t>(num_tile_states, stream);
+
+  auto streams = get_streams(concurrency, stream_pool);
+  // Pass 1: both output spans are empty, so the kernel writes nothing and only counts matches.
+  auto bytes_total =
+    multibyte_split_scan_full_source(source,
+                                     trie,
+                                     tile_multistates,
+                                     tile_offsets,
+                                     cudf::device_span<int32_t>(static_cast<int32_t*>(nullptr), 0),
+                                     cudf::device_span<char>(static_cast<char*>(nullptr), 0),
+                                     stream,
+                                     streams);
+
+  // allocate results
+  auto num_tiles      = cudf::util::div_rounding_up_safe(bytes_total, ITEMS_PER_TILE);
+  auto num_results    = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream);
+  auto string_offsets = rmm::device_uvector<int32_t>(num_results + 2, stream, mr);
+  auto string_chars   = rmm::device_uvector<char>(bytes_total, stream, mr);
+
+  // first and last element are set manually to zero and size of input, respectively.
+  // kernel is only responsible for determining delimiter offsets
+  auto string_count = static_cast<cudf::size_type>(string_offsets.size() - 1);
+  string_offsets.set_element_to_zero_async(0, stream);
+  string_offsets.set_element_async(string_count, bytes_total, stream);
+  // Pass 2: same scan again, now writing the characters and the delimiter offsets.
+  multibyte_split_scan_full_source(
+    source,
+    trie,
+    tile_multistates,
+    tile_offsets,
+    cudf::device_span<int32_t>(string_offsets).subspan(1, num_results),
+    string_chars,
+    stream,
+    streams);
+
+  return cudf::make_strings_column(
+    string_count, std::move(string_offsets), std::move(string_chars));
+}
+
+}  // namespace detail
+
+std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
+                                              std::string const& delimiter,
+                                              rmm::mr::device_memory_resource* mr)
+{  // Public entry point: runs the detail implementation on the default stream and waits for it.
+  rmm::cuda_stream_pool worker_streams(2);  // pool handed to the detail implementation
+  auto const default_stream = rmm::cuda_stream_default;
+  auto split_column =
+    detail::multibyte_split(source, delimiter, default_stream, mr, worker_streams);
+  default_stream.synchronize();
+
+  return split_column;
+}
+
+}  // namespace text
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu
index abf1f9599dc..c89f1b756d6 100644
--- a/cpp/src/strings/strings_column_factories.cu
+++ b/cpp/src/strings/strings_column_factories.cu
@@ -137,4 +137,46 @@ std::unique_ptr<column> make_strings_column(size_type num_strings,
                                   std::move(children));
 }
 
+std::unique_ptr<column> make_strings_column(size_type num_strings,
+                                            rmm::device_uvector<size_type>&& offsets,
+                                            rmm::device_uvector<char>&& chars,
+                                            rmm::device_buffer&& null_mask,
+                                            size_type null_count)
+{
+  CUDF_FUNC_RANGE();
+
+  // Record the element counts before release() hands the storage to the child columns.
+  auto const num_offsets = static_cast<size_type>(offsets.size());
+  auto const num_chars   = static_cast<size_type>(chars.size());
+
+  CUDF_EXPECTS(null_count <= 0 or null_mask.size() > 0, "Column with nulls must be nullable.");
+
+  CUDF_EXPECTS(num_strings == num_offsets - 1, "Invalid offsets column size for strings column.");
+
+  // Child 0 holds the offsets and child 1 the characters; neither child gets a null mask.
+  auto children = std::vector<std::unique_ptr<column>>();
+
+  children.emplace_back(std::make_unique<column>(  //
+    data_type{type_id::INT32},
+    num_offsets,
+    offsets.release(),
+    rmm::device_buffer(),
+    0));
+
+  children.emplace_back(std::make_unique<column>(  //
+    data_type{type_id::INT8},
+    num_chars,
+    chars.release(),
+    rmm::device_buffer(),
+    0));
+
+  // The parent STRING column is given an empty data buffer, the null mask and the two children.
+  return std::make_unique<column>(data_type{type_id::STRING},
+                                  num_strings,
+                                  rmm::device_buffer{},
+                                  std::move(null_mask),
+                                  null_count,
+                                  std::move(children));
+}
+
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 19421e3115d..edfbba74eb1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -195,6 +195,7 @@ ConfigureTest(ORC_TEST io/orc_test.cpp)
 ConfigureTest(PARQUET_TEST io/parquet_test.cpp)
 ConfigureTest(JSON_TEST io/json_test.cpp)
 ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
+ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
 if(CUDF_ENABLE_ARROW_S3)
   target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED")
 endif()
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
new file mode 100644
index 00000000000..d1fa787e000
--- /dev/null
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/io/text/data_chunk_source_factories.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+using namespace cudf;
+using namespace test;
+
+// 😀 | F0 9F 98 80 | 11110000 10011111 10011000 10000000
+// 😎 | F0 9F 98 8E | 11110000 10011111 10011000 10001110
+
+struct MultibyteSplitTest : public BaseFixture {  // fixture shared by the multibyte_split tests
+};
+
+TEST_F(MultibyteSplitTest, NondeterministicMatching)
+{
+  // The delimiter's "ab" prefix also appears right before the first full match ("ab" + "abac").
+  std::string const delimiter("abac");
+  std::string const host_input("ababacabacab");
+  strings_column_wrapper expected{"ababac", "abac", "ab"};
+
+  auto const source = cudf::io::text::make_source(host_input);
+  auto const actual = cudf::io::text::multibyte_split(*source, delimiter);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual);
+}
+
+TEST_F(MultibyteSplitTest, DelimiterAtEnd)
+{
+  // Input that ends with the delimiter is expected to yield a trailing empty string.
+  std::string const delimiter(":");
+  std::string const host_input("abcdefg:");
+  strings_column_wrapper expected{"abcdefg:", ""};
+
+  auto const source = cudf::io::text::make_source(host_input);
+  auto const actual = cudf::io::text::multibyte_split(*source, delimiter);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual);
+}
+
+TEST_F(MultibyteSplitTest, LargeInput)
+{
+  std::string const record("...:|");  // every record consists of exactly one delimiter
+  std::string host_input;
+  std::vector<std::string> host_expected;
+
+  // 2 * 32 * 128 * 1024 records; presumably sized so the input spans several read chunks.
+  for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) {
+    host_input += record;
+    host_expected.emplace_back(record);
+  }
+  host_expected.emplace_back(std::string(""));  // the input ends with a delimiter
+
+  auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto actual = cudf::io::text::multibyte_split(*source, record);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *actual);
+}
+
+TEST_F(MultibyteSplitTest, OverlappingMatchErasure)
+{
+  auto delimiter = "::";
+  // The input is a run of ':' characters, so consecutive "::" matches overlap one another.
+  auto host_input = std::string(
+    ":::::"
+    ":::::");
+  auto expected = strings_column_wrapper{":::::", ":::::"};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(*source, delimiter);
+  // Only exercises the call for now; the comparison below is intentionally disabled.
+  // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case is not yet supported.
+}
+
+TEST_F(MultibyteSplitTest, HandpickedInput)
+{
+  auto delimiters = "::|";  // a single three-byte delimiter, despite the plural name
+  auto host_input = std::string(
+    "aaa::|"
+    "bbb::|"
+    "ccc::|"
+    "ddd::|"
+    "eee::|"
+    "fff::|"
+    "ggg::|"
+    "hhh::|"
+    "___::|"
+    "here::|"
+    "is::|"
+    "another::|"
+    "simple::|"
+    "text::|"
+    "seperated::|"
+    "by::|"
+    "emojis::|"
+    "which::|"
+    "are::|"
+    "multiple::|"
+    "bytes::|"
+    "and::|"
+    "used::|"
+    "as::|"
+    "delimiters.::|"
+    "::|"
+    "::|"
+    "::|");
+  // Each expected row keeps its delimiter; the final empty row follows the last delimiter.
+  auto expected = strings_column_wrapper{
+    "aaa::|",         "bbb::|",      "ccc::|",       "ddd::|",  "eee::|",    "fff::|",
+    "ggg::|",         "hhh::|",      "___::|",       "here::|", "is::|",     "another::|",
+    "simple::|",      "text::|",     "seperated::|", "by::|",   "emojis::|", "which::|",
+    "are::|",         "multiple::|", "bytes::|",     "and::|",  "used::|",   "as::|",
+    "delimiters.::|", "::|",         "::|",          "::|",     ""};
+
+  auto source = cudf::io::text::make_source(host_input);
+  auto out    = cudf::io::text::multibyte_split(*source, delimiters);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS);
+}

From 5647b535b2a1546c56ddfd12e7cbd2fb198e64e8 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 24 Aug 2021 17:52:20 -0500
Subject: [PATCH 34/46] Add support for BaseIndexer in Rolling APIs (#9085)

Fixes: #9085

This PR adds support for `BaseIndexer` subclasses in `Rolling` APIs. It also contains a fix related to `fillna`, with a test case added.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9085
---
 python/cudf/cudf/core/column/numerical.py |  3 +
 python/cudf/cudf/core/window/rolling.py   | 67 ++++++++++++++++-------
 python/cudf/cudf/tests/test_replace.py    |  2 +
 python/cudf/cudf/tests/test_rolling.py    | 48 ++++++++++++++++
 4 files changed, 99 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index bc12b42a3fa..db1829d5f38 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -375,6 +375,9 @@ def fillna(
         else:
             col = self
 
+        if col.null_count == 0:
+            return col
+
         if method is not None:
             return super(NumericalColumn, col).fillna(fill_value, method)
 
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index e3ed15ba2a6..317ce29d00e 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -4,6 +4,7 @@
 
 import numba
 import pandas as pd
+from pandas.api.indexers import BaseIndexer
 
 import cudf
 from cudf import _lib as libcudf
@@ -20,7 +21,7 @@ class Rolling(GetAttrGetItemMixin):
 
     Parameters
     ----------
-    window : int or offset
+    window : int, offset or a BaseIndexer subclass
         Size of the window, i.e., the number of observations used
         to calculate the statistic.
         For datetime indexes, an offset can be provided instead
@@ -28,6 +29,8 @@ class Rolling(GetAttrGetItemMixin):
         As opposed to a fixed window size, each window will be
         sized to accommodate observations within the time period
         specified by the offset.
+        If a BaseIndexer subclass is passed, calculates the window
+        boundaries based on the defined ``get_window_bounds`` method.
     min_periods : int, optional
         The minimum number of observations in the window that are
         required to be non-null, so that the result is non-null.
@@ -195,26 +198,46 @@ def __getitem__(self, arg):
         )
 
     def _apply_agg_series(self, sr, agg_name):
+        source_column = sr._column
+        min_periods = self.min_periods or 1
         if isinstance(self.window, int):
-            result_col = libcudf.rolling.rolling(
-                sr._column,
-                None,
-                None,
-                self.window,
-                self.min_periods,
-                self.center,
-                agg_name,
+            preceding_window = None
+            following_window = None
+            window = self.window
+        elif isinstance(self.window, BaseIndexer):
+            start, end = self.window.get_window_bounds(
+                num_values=len(self.obj),
+                min_periods=self.min_periods,
+                center=self.center,
+                closed=None,
             )
+            start = as_column(start, dtype="int32")
+            end = as_column(end, dtype="int32")
+
+            idx = cudf.core.column.arange(len(start))
+            preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype(
+                "int32"
+            )
+            following_window = (end - idx - cudf.Scalar(1, "int32")).astype(
+                "int32"
+            )
+            window = None
         else:
-            result_col = libcudf.rolling.rolling(
-                sr._column,
-                as_column(self.window),
-                column.full(self.window.size, 0, dtype=self.window.dtype),
-                None,
-                self.min_periods,
-                self.center,
-                agg_name,
+            preceding_window = as_column(self.window)
+            following_window = column.full(
+                self.window.size, 0, dtype=self.window.dtype
             )
+            window = None
+
+        result_col = libcudf.rolling.rolling(
+            source_column=source_column,
+            pre_column_window=preceding_window,
+            fwd_column_window=following_window,
+            window=window,
+            min_periods=min_periods,
+            center=self.center,
+            op=agg_name,
+        )
         return sr._from_data({sr.name: result_col}, sr._index)
 
     def _apply_agg_dataframe(self, df, agg_name):
@@ -305,15 +328,17 @@ def _normalize(self):
             if self.min_periods is None:
                 min_periods = window
         else:
-            if isinstance(window, numba.cuda.devicearray.DeviceNDArray):
-                # window is a device_array of window sizes
+            if isinstance(
+                window, (numba.cuda.devicearray.DeviceNDArray, BaseIndexer)
+            ):
+                # window is a device_array of window sizes or BaseIndexer
                 self.window = window
                 self.min_periods = min_periods
                 return
 
             if not isinstance(self.obj.index, cudf.core.index.DatetimeIndex):
                 raise ValueError(
-                    "window must be an integer for " "non datetime index"
+                    "window must be an integer for non datetime index"
                 )
 
             self._time_window = True
@@ -326,7 +351,7 @@ def _normalize(self):
                 window = window.to_timedelta64()
             except ValueError as e:
                 raise ValueError(
-                    "window must be integer or " "convertible to a timedelta"
+                    "window must be integer or convertible to a timedelta"
                 ) from e
             if self.min_periods is None:
                 min_periods = 1
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 33bef2c677b..f60baec746f 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -657,6 +657,7 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace):
         pd.DataFrame(
             {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"]
         ),
+        pd.DataFrame({"a": [1, 2, 3]}),
     ],
 )
 @pytest.mark.parametrize(
@@ -671,6 +672,7 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace):
         {"b": pd.Series([11, 22, 33], index=["a", "p", "z"])},
         {"a": 5, "b": pd.Series([3, 4, 5], index=["a", "p", "z"])},
         {"c": 100},
+        np.nan,
     ],
 )
 @pytest.mark.parametrize("inplace", [True, False])
diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py
index 07e7f43c992..8a8293cd090 100644
--- a/python/cudf/cudf/tests/test_rolling.py
+++ b/python/cudf/cudf/tests/test_rolling.py
@@ -369,3 +369,51 @@ def test_rolling_groupby_offset(agg, window_size):
     )
     got = getattr(gdf.groupby("group").rolling(window_size), agg)().fillna(-1)
     assert_eq(expect, got, check_dtype=False)
+
+
+def test_rolling_custom_index_support():
+    from pandas.api.indexers import BaseIndexer
+
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = i + 1
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+
+            return start, end
+
+    use_expanding = [True, False, True, False, True]
+    indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+
+    df = pd.DataFrame({"values": range(5)})
+    gdf = cudf.from_pandas(df)
+
+    expected = df.rolling(window=indexer).sum()
+    actual = gdf.rolling(window=indexer).sum()
+
+    assert_eq(expected, actual, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "indexer",
+    [
+        pd.api.indexers.FixedForwardWindowIndexer(window_size=2),
+        pd.core.window.indexers.ExpandingIndexer(),
+        pd.core.window.indexers.FixedWindowIndexer(window_size=3),
+    ],
+)
+def test_rolling_indexer_support(indexer):
+    df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
+    gdf = cudf.from_pandas(df)
+
+    expected = df.rolling(window=indexer, min_periods=2).sum()
+    actual = gdf.rolling(window=indexer, min_periods=2).sum()
+
+    assert_eq(expected, actual)

From 359be0a24f702926c276d70c1e3f7f533ab63551 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Tue, 24 Aug 2021 20:02:45 -0400
Subject: [PATCH 35/46] Import fix (#9104)

Closes https://github.com/rapidsai/cudf/issues/9084

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/9104
---
 python/cudf/cudf/utils/cudautils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index fb7163c52e3..727bbb1c345 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -240,6 +240,7 @@ def compile_udf(udf, type_signature):
       An numpy type
 
     """
+    import cudf.core.udf
 
     # Check if we've already compiled a similar (but possibly distinct)
     # function before

From 44bf4baead3950a1db5c9aad3c3d4d49a3273673 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 24 Aug 2021 20:52:48 -0400
Subject: [PATCH 36/46] Fix compile warnings found using nvcc 11.4 (#9101)

While looking into a `compute-sanitizer` issue, I found the problem with the tool had been fixed in 11.4. Building libcudf in 11.4 uncovered some new compile warnings which are fixed in this PR. All the warnings were identifying unused variables.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/9101
---
 cpp/src/binaryop/compiled/binary_ops.cu  | 3 +--
 cpp/src/io/orc/reader_impl.cu            | 2 --
 cpp/src/merge/merge.cu                   | 9 ++++-----
 cpp/tests/join/conditional_join_tests.cu | 4 +---
 4 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index 2b38224864a..7b0139a0082 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -43,7 +43,7 @@ struct scalar_as_column_device_view {
   template <typename T, std::enable_if_t<(is_fixed_width<T>())>* = nullptr>
   return_type operator()(scalar const& s,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr)
+                         rmm::mr::device_memory_resource*)
   {
     auto& h_scalar_type_view = static_cast<cudf::scalar_type_t<T>&>(const_cast<scalar&>(s));
     auto col_v =
@@ -201,7 +201,6 @@ struct null_considering_binop {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr) const
   {
-    std::unique_ptr<column> out;
     // Create device views for inputs
     auto const lhs_dev_view = get_device_view(lhs);
     auto const rhs_dev_view = get_device_view(rhs);
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 1b78d8b8585..83be58f5e56 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -176,7 +176,6 @@ size_t gather_stream_info(const size_t stripe_index,
                           const orc::StripeInformation* stripeinfo,
                           const orc::StripeFooter* stripefooter,
                           const std::vector<int>& orc2gdf,
-                          const std::vector<orc_column_meta>& gdf2orc,
                           const std::vector<orc::SchemaType> types,
                           bool use_index,
                           size_t* num_dictionary_entries,
@@ -1203,7 +1202,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                                                           stripe_info,
                                                           stripe_footer,
                                                           _col_meta.orc_col_map[level],
-                                                          selected_columns,
                                                           _metadata->get_types(),
                                                           use_index,
                                                           &num_dict_entries,
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 38025a8a0ed..147db2fdfe7 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -225,11 +225,10 @@ struct column_merger {
   explicit column_merger(index_vector const& row_order) : row_order_(row_order) {}
 
   template <typename Element, CUDF_ENABLE_IF(not is_rep_layout_compatible<Element>())>
-  std::unique_ptr<column> operator()(
-    column_view const& lcol,
-    column_view const& rcol,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const
+  std::unique_ptr<column> operator()(column_view const&,
+                                     column_view const&,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*) const
   {
     CUDF_FAIL("Unsupported type for merge.");
   }
diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu
index 8018d613e05..d566d2086bb 100644
--- a/cpp/tests/join/conditional_join_tests.cu
+++ b/cpp/tests/join/conditional_join_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,8 +44,6 @@ constexpr cudf::size_type JoinNoneValue =
 // Common column references.
 const auto col_ref_left_0  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
 const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
-const auto col_ref_left_1  = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT);
-const auto col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
 
 // Common expressions.
 auto left_zero_eq_right_zero =

From f0fa255add77daf6fd14b714286d01d5c2b4d082 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Wed, 25 Aug 2021 10:52:24 -0500
Subject: [PATCH 37/46] Fix cudf::hash_join output size for struct joins
 (#9107)

Fixes #9095.

This adds calls to `flatten_nested_columns` in the `cudf::hash_join` join output size APIs along with tests for joins on struct columns using `cudf::hash_join`.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/9107
---
 cpp/src/join/hash_join.cu     |  31 +++++---
 cpp/tests/join/join_tests.cpp | 136 ++++++++++++++++++++--------------
 2 files changed, 104 insertions(+), 63 deletions(-)

diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index 50cc479fcf4..ee1eaeaed47 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -349,11 +349,15 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(_hash_table, "Hash table of hash join is null.");
 
-  auto build_table = cudf::table_device_view::create(_build, stream);
-  auto probe_table = cudf::table_device_view::create(probe, stream);
+  auto flattened_probe = structs::detail::flatten_nested_columns(
+    probe, {}, {}, structs::detail::column_nullability::FORCE);
+  auto const flattened_probe_table = std::get<0>(flattened_probe);
+
+  auto build_table_ptr           = cudf::table_device_view::create(_build, stream);
+  auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream);
 
   return cudf::detail::compute_join_output_size<cudf::detail::join_kind::INNER_JOIN>(
-    *build_table, *probe_table, *_hash_table, compare_nulls, stream);
+    *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream);
 }
 
 std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe,
@@ -365,11 +369,15 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr
   // Trivial left join case - exit early
   if (!_hash_table) { return probe.num_rows(); }
 
-  auto build_table = cudf::table_device_view::create(_build, stream);
-  auto probe_table = cudf::table_device_view::create(probe, stream);
+  auto flattened_probe = structs::detail::flatten_nested_columns(
+    probe, {}, {}, structs::detail::column_nullability::FORCE);
+  auto const flattened_probe_table = std::get<0>(flattened_probe);
+
+  auto build_table_ptr           = cudf::table_device_view::create(_build, stream);
+  auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream);
 
   return cudf::detail::compute_join_output_size<cudf::detail::join_kind::LEFT_JOIN>(
-    *build_table, *probe_table, *_hash_table, compare_nulls, stream);
+    *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream);
 }
 
 std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe,
@@ -382,10 +390,15 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr
   // Trivial left join case - exit early
   if (!_hash_table) { return probe.num_rows(); }
 
-  auto build_table = cudf::table_device_view::create(_build, stream);
-  auto probe_table = cudf::table_device_view::create(probe, stream);
+  auto flattened_probe = structs::detail::flatten_nested_columns(
+    probe, {}, {}, structs::detail::column_nullability::FORCE);
+  auto const flattened_probe_table = std::get<0>(flattened_probe);
+
+  auto build_table_ptr           = cudf::table_device_view::create(_build, stream);
+  auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream);
 
-  return get_full_join_size(*build_table, *probe_table, *_hash_table, compare_nulls, stream, mr);
+  return get_full_join_size(
+    *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream, mr);
 }
 
 template <cudf::detail::join_kind JoinKind>
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index e468368842a..af998e366e9 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -44,6 +44,28 @@ constexpr cudf::size_type NoneValue =
   std::numeric_limits<cudf::size_type>::min();  // TODO: how to test if this isn't public?
 
 struct JoinTest : public cudf::test::BaseFixture {
+  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> gather_maps_as_tables(
+    cudf::column_view const& expected_left_map,
+    cudf::column_view const& expected_right_map,
+    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
+              std::unique_ptr<rmm::device_uvector<cudf::size_type>>> const& result)
+  {
+    auto result_table =
+      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.first->size()),
+                                          result.first->data()},
+                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.second->size()),
+                                          result.second->data()}});
+    auto result_sort_order = cudf::sorted_order(result_table);
+    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
+
+    cudf::table_view gold({expected_left_map, expected_right_map});
+    auto gold_sort_order = cudf::sorted_order(gold);
+    auto sorted_gold     = cudf::gather(gold, *gold_sort_order);
+
+    return std::make_pair(std::move(sorted_gold), std::move(sorted_result));
+  }
 };
 
 TEST_F(JoinTest, EmptySentinelRepro)
@@ -1232,27 +1254,9 @@ TEST_F(JoinTest, HashJoinSequentialProbes)
     EXPECT_EQ(output_size, size_gold);
 
     auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size);
-    auto result_table =
-      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
-                                          static_cast<cudf::size_type>(result.first->size()),
-                                          result.first->data()},
-                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
-                                          static_cast<cudf::size_type>(result.second->size()),
-                                          result.second->data()}});
-    auto result_sort_order = cudf::sorted_order(result_table);
-    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
-
     column_wrapper<int32_t> col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}};
     column_wrapper<int32_t> col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}};
-
-    CVector cols_gold;
-    cols_gold.push_back(col_gold_0.release());
-    cols_gold.push_back(col_gold_1.release());
-
-    Table gold(std::move(cols_gold));
-    auto gold_sort_order = cudf::sorted_order(gold.view());
-    auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 
@@ -1270,27 +1274,9 @@ TEST_F(JoinTest, HashJoinSequentialProbes)
     EXPECT_EQ(output_size, size_gold);
 
     auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size);
-    auto result_table =
-      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
-                                          static_cast<cudf::size_type>(result.first->size()),
-                                          result.first->data()},
-                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
-                                          static_cast<cudf::size_type>(result.second->size()),
-                                          result.second->data()}});
-    auto result_sort_order = cudf::sorted_order(result_table);
-    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
-
     column_wrapper<int32_t> col_gold_0{{0, 1, 2, 3, 4}};
     column_wrapper<int32_t> col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}};
-
-    CVector cols_gold;
-    cols_gold.push_back(col_gold_0.release());
-    cols_gold.push_back(col_gold_1.release());
-
-    Table gold(std::move(cols_gold));
-    auto gold_sort_order = cudf::sorted_order(gold.view());
-    auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 
@@ -1308,27 +1294,69 @@ TEST_F(JoinTest, HashJoinSequentialProbes)
     EXPECT_EQ(output_size, size_gold);
 
     auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size);
-    auto result_table =
-      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
-                                          static_cast<cudf::size_type>(result.first->size()),
-                                          result.first->data()},
-                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
-                                          static_cast<cudf::size_type>(result.second->size()),
-                                          result.second->data()}});
-    auto result_sort_order = cudf::sorted_order(result_table);
-    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
-
     column_wrapper<int32_t> col_gold_0{{2, 4, 0}};
     column_wrapper<int32_t> col_gold_1{{1, 1, 4}};
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+  }
+}
 
-    CVector cols_gold;
-    cols_gold.push_back(col_gold_0.release());
-    cols_gold.push_back(col_gold_1.release());
+TEST_F(JoinTest, HashJoinWithStructsAndNulls)
+{
+  auto col0_names_col = strcol_wrapper{
+    "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
+  auto col0_ages_col = column_wrapper<int32_t>{{48, 27, 351, 31, 25}};
+
+  auto col0_is_human_col = column_wrapper<bool>{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
 
-    Table gold(std::move(cols_gold));
-    auto gold_sort_order = cudf::sorted_order(gold.view());
-    auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
+  auto col0 =
+    cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}};
+
+  auto col1_names_col = strcol_wrapper{
+    "Samuel Vimes", "Detritus", "Detritus", "Carrot Ironfoundersson", "Angua von Überwald"};
+  auto col1_ages_col = column_wrapper<int32_t>{{48, 35, 351, 22, 25}};
 
+  auto col1_is_human_col = column_wrapper<bool>{{true, true, false, false, true}, {1, 1, 0, 1, 1}};
+
+  auto col1 =
+    cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}};
+
+  CVector cols0, cols1;
+  cols0.push_back(col0.release());
+  cols1.push_back(col1.release());
+
+  Table t0(std::move(cols0));
+  Table t1(std::move(cols1));
+
+  auto hash_join = cudf::hash_join(t1, cudf::null_equality::EQUAL);
+
+  {
+    auto output_size = hash_join.left_join_size(t0);
+    EXPECT_EQ(5, output_size);
+    auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, output_size);
+    column_wrapper<int32_t> col_gold_0{{0, 1, 2, 3, 4}};
+    column_wrapper<int32_t> col_gold_1{{0, NoneValue, 2, NoneValue, NoneValue}};
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+  }
+
+  {
+    auto output_size = hash_join.inner_join_size(t0);
+    EXPECT_EQ(2, output_size);
+    auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, output_size);
+    column_wrapper<int32_t> col_gold_0{{0, 2}};
+    column_wrapper<int32_t> col_gold_1{{0, 2}};
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+  }
+
+  {
+    auto output_size = hash_join.full_join_size(t0);
+    EXPECT_EQ(8, output_size);
+    auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, output_size);
+    column_wrapper<int32_t> col_gold_0{{NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}};
+    column_wrapper<int32_t> col_gold_1{{1, 3, 4, 0, NoneValue, 2, NoneValue, NoneValue}};
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 }

From 2a566dd3a9bd81277640c71ca7d350a65ca78fc3 Mon Sep 17 00:00:00 2001
From: shaneding <shane200195@gmail.com>
Date: Wed, 25 Aug 2021 16:49:45 -0400
Subject: [PATCH 38/46] Implement timestamp ceil (#8942)

Partly addresses #8682

This adds `ceil` functions for timestamp columns in libcudf. Only fixed resolutions (day through nanosecond) are supported.

Authors:
  - https://github.com/shaneding

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/8942
---
 cpp/include/cudf/datetime.hpp            |  85 ++++++++++++++
 cpp/include/cudf/wrappers/durations.hpp  |   8 ++
 cpp/src/datetime/datetime_ops.cu         | 135 +++++++++++++++++++++++
 cpp/tests/datetime/datetime_ops_test.cpp |  56 ++++++++++
 4 files changed, 284 insertions(+)

diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp
index 2e4ac870969..52b21c98f75 100644
--- a/cpp/include/cudf/datetime.hpp
+++ b/cpp/include/cudf/datetime.hpp
@@ -237,5 +237,90 @@ std::unique_ptr<cudf::column> extract_quarter(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
+
+/**
+ * @brief Round up to the nearest day
+ *
+ * @param column cudf::column_view of the input datetime values
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> ceil_day(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round up to the nearest hour
+ *
+ * @param column cudf::column_view of the input datetime values
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> ceil_hour(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round up to the nearest minute
+ *
+ * @param column cudf::column_view of the input datetime values
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> ceil_minute(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round up to the nearest second
+ *
+ * @param column cudf::column_view of the input datetime values
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> ceil_second(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round up to the nearest millisecond
+ *
+ * @param column cudf::column_view of the input datetime values
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<column> ceil_millisecond(
+  column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round up to the nearest microsecond
+ *
+ * @param column cudf::column_view of the input datetime values
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<column> ceil_microsecond(
+  column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round up to the nearest nanosecond
+ *
+ * @param column cudf::column_view of the input datetime values
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<column> ceil_nanosecond(
+  column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 }  // namespace datetime
 }  // namespace cudf
diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp
index 07bcc1976a8..8bc8b7a7e6e 100644
--- a/cpp/include/cudf/wrappers/durations.hpp
+++ b/cpp/include/cudf/wrappers/durations.hpp
@@ -33,6 +33,14 @@ namespace cudf {
  * @brief Type alias representing an int32_t duration of days.
  */
 using duration_D = cuda::std::chrono::duration<int32_t, cuda::std::chrono::days::period>;
+/**
+ * @brief Type alias representing an int32_t duration of hours.
+ */
+using duration_h = cuda::std::chrono::duration<int32_t, cuda::std::chrono::hours::period>;
+/**
+ * @brief Type alias representing an int32_t duration of minutes.
+ */
+using duration_m = cuda::std::chrono::duration<int32_t, cuda::std::chrono::minutes::period>;
 /**
  * @brief Type alias representing an int64_t duration of seconds.
  */
diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu
index 9879a6c5423..df013be717f 100644
--- a/cpp/src/datetime/datetime_ops.cu
+++ b/cpp/src/datetime/datetime_ops.cu
@@ -24,7 +24,10 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+#include <cudf/wrappers/durations.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -41,6 +44,9 @@ enum class datetime_component {
   HOUR,
   MINUTE,
   SECOND,
+  MILLISECOND,
+  MICROSECOND,
+  NANOSECOND
 };
 
 template <datetime_component Component>
@@ -77,6 +83,35 @@ struct extract_component_operator {
   }
 };
 
+template <datetime_component COMPONENT>
+struct ceil_timestamp {
+  template <typename Timestamp>
+  CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const
+  {
+    using namespace cuda::std::chrono;
+    // Handles D, H, T (minute), S, L (millisecond), U (microsecond), N (nanosecond)
+    switch (COMPONENT) {
+      case datetime_component::DAY:
+        return time_point_cast<typename Timestamp::duration>(ceil<duration_D>(ts));
+      case datetime_component::HOUR:
+        return time_point_cast<typename Timestamp::duration>(ceil<duration_h>(ts));
+      case datetime_component::MINUTE:
+        return time_point_cast<typename Timestamp::duration>(ceil<duration_m>(ts));
+      case datetime_component::SECOND:
+        return time_point_cast<typename Timestamp::duration>(ceil<duration_s>(ts));
+      case datetime_component::MILLISECOND:
+        return time_point_cast<typename Timestamp::duration>(ceil<duration_ms>(ts));
+      case datetime_component::MICROSECOND:
+        return time_point_cast<typename Timestamp::duration>(ceil<duration_us>(ts));
+      case datetime_component::NANOSECOND:
+        return time_point_cast<typename Timestamp::duration>(ceil<duration_ns>(ts));
+      default: cudf_assert(false && "Unexpected resolution");
+    }
+
+    return {};
+  }
+};
+
 // Number of days until month indexed by leap year and month (0-based index)
 static __device__ int16_t const days_until_month[2][13] = {
   {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},  // For non leap years
@@ -155,6 +190,45 @@ struct is_leap_year_op {
   }
 };
 
+// Specific function for applying ceil/floor date ops
+template <typename TransformFunctor>
+struct dispatch_ceil {
+  template <typename Timestamp>
+  std::enable_if_t<cudf::is_timestamp<Timestamp>(), std::unique_ptr<cudf::column>> operator()(
+    cudf::column_view const& column,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr) const
+  {
+    auto size            = column.size();
+    auto output_col_type = data_type{cudf::type_to_id<Timestamp>()};
+
+    // Return an empty column if source column is empty
+    if (size == 0) return make_empty_column(output_col_type);
+
+    auto output = make_fixed_width_column(output_col_type,
+                                          size,
+                                          cudf::detail::copy_bitmask(column, stream, mr),
+                                          column.null_count(),
+                                          stream,
+                                          mr);
+
+    thrust::transform(rmm::exec_policy(stream),
+                      column.begin<Timestamp>(),
+                      column.end<Timestamp>(),
+                      output->mutable_view().begin<Timestamp>(),
+                      TransformFunctor{});
+
+    return output;
+  }
+
+  template <typename Timestamp, typename... Args>
+  std::enable_if_t<!cudf::is_timestamp<Timestamp>(), std::unique_ptr<cudf::column>> operator()(
+    Args&&...)
+  {
+    CUDF_FAIL("Must be cudf::timestamp");
+  }
+};
+
 // Apply the functor for every element/row in the input column to create the output column
 template <typename TransformFunctor, typename OutputColT>
 struct launch_functor {
@@ -286,6 +360,15 @@ std::unique_ptr<column> add_calendrical_months(column_view const& timestamp_colu
   return output;
 }
 
+template <datetime_component Component>
+std::unique_ptr<column> ceil_general(column_view const& column,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+{
+  return cudf::type_dispatcher(
+    column.type(), dispatch_ceil<detail::ceil_timestamp<Component>>{}, column, stream, mr);
+}
+
 std::unique_ptr<column> extract_year(column_view const& column,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
@@ -388,6 +471,58 @@ std::unique_ptr<column> extract_quarter(column_view const& column,
 
 }  // namespace detail
 
+std::unique_ptr<column> ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::ceil_general<detail::datetime_component::DAY>(
+    column, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::ceil_general<detail::datetime_component::HOUR>(
+    column, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::ceil_general<detail::datetime_component::MINUTE>(
+    column, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::ceil_general<detail::datetime_component::SECOND>(
+    column, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> ceil_millisecond(column_view const& column,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::ceil_general<detail::datetime_component::MILLISECOND>(
+    column, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> ceil_microsecond(column_view const& column,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::ceil_general<detail::datetime_component::MICROSECOND>(
+    column, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> ceil_nanosecond(column_view const& column,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::ceil_general<detail::datetime_component::NANOSECOND>(
+    column, rmm::cuda_stream_default, mr);
+}
+
 std::unique_ptr<column> extract_year(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp
index 39ad5f556d4..4a1c0512643 100644
--- a/cpp/tests/datetime/datetime_ops_test.cpp
+++ b/cpp/tests/datetime/datetime_ops_test.cpp
@@ -348,6 +348,62 @@ TEST_F(BasicDatetimeOpsTest, TestLastDayOfMonthWithDate)
     verbosity);
 }
 
+TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime)
+{
+  using T = TypeParam;
+  using namespace cudf::test;
+  using namespace cudf::datetime;
+  using namespace cuda::std::chrono;
+
+  auto start = milliseconds(-2500000000000);  // Sat, 11 Oct 1890 19:33:20 GMT
+  auto stop_ = milliseconds(2500000000000);   // Mon, 22 Mar 2049 04:26:40 GMT
+
+  auto input = generate_timestamps<T>(this->size(), time_point_ms(start), time_point_ms(stop_));
+
+  auto host_val                     = to_host<T>(input);
+  thrust::host_vector<T> timestamps = host_val.first;
+
+  thrust::host_vector<T> ceiled_day(timestamps.size());
+  thrust::transform(timestamps.begin(), timestamps.end(), ceiled_day.begin(), [](auto i) {
+    return time_point_cast<typename T::duration>(ceil<days>(i));
+  });
+  auto expected_day =
+    fixed_width_column_wrapper<T, typename T::duration::rep>(ceiled_day.begin(), ceiled_day.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_day(input), expected_day);
+
+  thrust::host_vector<T> ceiled_hour(timestamps.size());
+  thrust::transform(timestamps.begin(), timestamps.end(), ceiled_hour.begin(), [](auto i) {
+    return time_point_cast<typename T::duration>(ceil<hours>(i));
+  });
+  auto expected_hour = fixed_width_column_wrapper<T, typename T::duration::rep>(ceiled_hour.begin(),
+                                                                                ceiled_hour.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_hour(input), expected_hour);
+
+  std::vector<T> ceiled_minute(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), ceiled_minute.begin(), [](auto i) {
+    return time_point_cast<typename T::duration>(ceil<minutes>(i));
+  });
+  auto expected_minute = fixed_width_column_wrapper<T, typename T::duration::rep>(
+    ceiled_minute.begin(), ceiled_minute.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_minute(input), expected_minute);
+
+  std::vector<T> ceiled_second(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), ceiled_second.begin(), [](auto i) {
+    return time_point_cast<typename T::duration>(ceil<seconds>(i));
+  });
+  auto expected_second = fixed_width_column_wrapper<T, typename T::duration::rep>(
+    ceiled_second.begin(), ceiled_second.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_second(input), expected_second);
+
+  std::vector<T> ceiled_millisecond(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), ceiled_millisecond.begin(), [](auto i) {
+    return time_point_cast<typename T::duration>(ceil<milliseconds>(i));
+  });
+  auto expected_millisecond = fixed_width_column_wrapper<T, typename T::duration::rep>(
+    ceiled_millisecond.begin(), ceiled_millisecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_millisecond(input), expected_millisecond);
+}
+
 TEST_F(BasicDatetimeOpsTest, TestDayOfYearWithDate)
 {
   using namespace cudf::test;

From d29c441607d6d546c57a9a7ffcaf40247861398a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 25 Aug 2021 16:54:28 -0400
Subject: [PATCH 39/46] Fix cudf::strings::is_fixed_point checking of overflow
 for decimal32 (#9093)

While working on `decimal128` support, @codereport found a bug in the `cudf::strings::is_fixed_point` logic where a large integer (in a strings column) could return true/valid even though it overflows the `Rep` type for the `decimal32` type. The gtest values did not include a value that would have shown this error. This PR adds the test string and fixes the logic to properly check the overflow condition.

The previous logic relied on storing intermediate values in `uint64_t` variables, so any number that fits in `uint64_t` was not detected as an overflow for `decimal32`. This PR changes the functions to make more use of the input type's storage type, which helps identify the overflow correctly and helps with specializing for `decimal128`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Conor Hoekstra (https://github.com/codereport)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/9093
---
 .../strings/detail/convert/fixed_point.cuh    | 28 ++++++++++++-------
 .../strings/convert/convert_fixed_point.cu    | 12 ++++----
 cpp/tests/strings/fixed_point_tests.cpp       | 28 ++++++++++++-------
 3 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
index 53774ed948d..56205c161b1 100644
--- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
+++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
@@ -17,6 +17,8 @@
 #include <thrust/optional.h>
 #include <thrust/pair.h>
 
+#include <type_traits>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -24,20 +26,25 @@ namespace detail {
 /**
  * @brief Return the integer component of a decimal string.
  *
- * This is reads everything up to the exponent 'e' notation.
+ * This reads everything up to the exponent 'e' notation.
  * The return includes the integer digits and any exponent offset.
  *
+ * @tparam UnsignedDecimalType The unsigned version of the desired decimal type.
+ *                             Use the `std::make_unsigned_t` to create the
+ *                             unsigned type from the storage type.
+ *
  * @param[in,out] iter Start of characters to parse
  * @param[in] end End of characters to parse
  * @return Integer component and exponent offset.
  */
-__device__ inline thrust::pair<uint64_t, int32_t> parse_integer(char const*& iter,
-                                                                char const* iter_end,
-                                                                const char decimal_pt_char = '.')
+template <typename UnsignedDecimalType>
+__device__ inline thrust::pair<UnsignedDecimalType, int32_t> parse_integer(
+  char const*& iter, char const* iter_end, const char decimal_pt_char = '.')
 {
   // highest value where another decimal digit cannot be appended without an overflow;
-  // this preserves the most digits when scaling the final result
-  constexpr uint64_t decimal_max = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
+  // this preserves the most digits when scaling the final result for this type
+  constexpr UnsignedDecimalType decimal_max =
+    (std::numeric_limits<UnsignedDecimalType>::max() - 9L) / 10L;
 
   uint64_t value     = 0;  // for checking overflow
   int32_t exp_offset = 0;
@@ -56,7 +63,7 @@ __device__ inline thrust::pair<uint64_t, int32_t> parse_integer(char const*& ite
     if (value > decimal_max) {
       exp_offset += static_cast<int32_t>(!decimal_found);
     } else {
-      value = (value * 10) + static_cast<uint64_t>(ch - '0');
+      value = (value * 10) + static_cast<UnsignedDecimalType>(ch - '0');
       exp_offset -= static_cast<int32_t>(decimal_found);
     }
   }
@@ -130,7 +137,8 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int
   // if string begins with a sign, continue with next character
   if (sign != 0) ++iter;
 
-  auto [value, exp_offset] = parse_integer(iter, iter_end);
+  using UnsignedDecimalType = std::make_unsigned_t<DecimalType>;
+  auto [value, exp_offset]  = parse_integer<UnsignedDecimalType>(iter, iter_end);
   if (value == 0) { return DecimalType{0}; }
 
   // check for exponent
@@ -143,9 +151,9 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int
 
   // shift the output value based on the exp_ten and the scale values
   if (exp_ten < scale) {
-    value = value / static_cast<uint64_t>(exp10(static_cast<double>(scale - exp_ten)));
+    value = value / static_cast<UnsignedDecimalType>(exp10(static_cast<double>(scale - exp_ten)));
   } else {
-    value = value * static_cast<uint64_t>(exp10(static_cast<double>(exp_ten - scale)));
+    value = value * static_cast<UnsignedDecimalType>(exp10(static_cast<double>(exp_ten - scale)));
   }
 
   return static_cast<DecimalType>(value) * (sign == 0 ? 1 : sign);
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 2f57b38249f..6f7076422c4 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -97,7 +97,8 @@ struct string_to_decimal_check_fn {
 
     auto const iter_end = d_str.data() + d_str.size_bytes();
 
-    auto [value, exp_offset] = parse_integer(iter, iter_end);
+    using UnsignedDecimalType = std::make_unsigned_t<DecimalType>;
+    auto [value, exp_offset]  = parse_integer<UnsignedDecimalType>(iter, iter_end);
 
     // only exponent notation is expected here
     if ((iter < iter_end) && (*iter != 'e' && *iter != 'E')) { return false; }
@@ -112,11 +113,10 @@ struct string_to_decimal_check_fn {
     exp_ten += exp_offset;
 
     // finally, check for overflow based on the exp_ten and scale values
-    return (exp_ten < scale)
-             ? true
-             : value <= static_cast<uint64_t>(
-                          std::numeric_limits<DecimalType>::max() /
-                          static_cast<DecimalType>(exp10(static_cast<double>(exp_ten - scale))));
+    return (exp_ten < scale) or
+           value <= static_cast<UnsignedDecimalType>(
+                      std::numeric_limits<DecimalType>::max() /
+                      static_cast<DecimalType>(exp10(static_cast<double>(exp_ten - scale))));
   }
 };
 
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index d8b570cee8b..820bf5ec216 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -189,31 +189,39 @@ TEST_F(StringsConvertTest, IsFixedPoint)
     "9223372036854775807",
     "-9223372036854775807",
     "9223372036854775808",
+    "9223372036854775808000",
     "100E2147483648",
   });
-  results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+  results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
                                           cudf::data_type{cudf::type_id::DECIMAL32});
-  auto const expected32 =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, false, false, false, false, false});
+  auto const expected32 = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, false, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32);
 
-  results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
+  results               = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers),
                                           cudf::data_type{cudf::type_id::DECIMAL64});
-  auto const expected64 =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, false, false});
+  auto const expected64 = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64);
 
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{10}});
-  auto const expected32_scaled =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, true, false});
+  auto const expected32_scaled = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, true, true, true, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32_scaled);
 
+  results = cudf::strings::is_fixed_point(
+    cudf::strings_column_view(big_numbers),
+    cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{10}});
+  auto const expected64_scaled_positive =
+    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, true, true, true, true, false});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled_positive);
+
   results = cudf::strings::is_fixed_point(
     cudf::strings_column_view(big_numbers),
     cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-5}});
-  auto const expected64_scaled =
-    cudf::test::fixed_width_column_wrapper<bool>({true, true, true, false, false, false, false});
+  auto const expected64_scaled = cudf::test::fixed_width_column_wrapper<bool>(
+    {true, true, true, false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled);
 }

From 40cad3868ae6902cb0fe3dcf2fea16cc5a52fab2 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 26 Aug 2021 04:19:31 -0700
Subject: [PATCH 40/46] Refactor implementation of column setitem (#9110)

This small PR reworks the behavior of `ColumnBase.__setitem__` when it is provided something other than a slice as input, for instance an array. This code path requires scattering the new values into the column, which previously involved converting the Column to a Frame in order to call Frame._scatter. Since that method was only used for this one purpose, the underlying libcudf scatter implementation has been rewritten to accept and return Columns, allowing us to inline the call and also get rid of a round trip from Column to Frame and back.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Marlene  (https://github.com/marlenezw)

URL: https://github.com/rapidsai/cudf/pull/9110
---
 python/cudf/cudf/_lib/copying.pyx      | 105 +++++++++----------------
 python/cudf/cudf/core/column/column.py |  18 ++---
 python/cudf/cudf/core/frame.py         |   8 --
 3 files changed, 43 insertions(+), 88 deletions(-)

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index ed31574b4a5..88f54632000 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -19,7 +19,7 @@ from cudf._lib.column cimport Column
 from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.table cimport Table
+from cudf._lib.table cimport Table, make_table_view
 
 from cudf._lib.reduce import minmax
 from cudf.core.abc import Serializable
@@ -192,92 +192,59 @@ def gather(
     )
 
 
-def _scatter_table(Table source_table, Column scatter_map,
-                   Table target_table, bool bounds_check=True):
+def scatter(object source, Column scatter_map, Column target_column,
+            bool bounds_check=True):
+    """
+    Scattering input into target as per the scatter map,
+    input can be a list of scalars or can be a table
+    """
 
-    cdef table_view source_table_view = source_table.data_view()
     cdef column_view scatter_map_view = scatter_map.view()
-    cdef table_view target_table_view = target_table.data_view()
+    cdef table_view target_table_view = make_table_view((target_column,))
     cdef bool c_bounds_check = bounds_check
-
     cdef unique_ptr[table] c_result
 
-    with nogil:
-        c_result = move(
-            cpp_copying.scatter(
-                source_table_view,
-                scatter_map_view,
-                target_table_view,
-                c_bounds_check
-            )
-        )
-
-    data, _ = data_from_unique_ptr(
-        move(c_result),
-        column_names=target_table._column_names,
-        index_names=None
-    )
-
-    return data, (
-        None if target_table._index is None else target_table._index.copy(
-            deep=False)
-    )
-
-
-def _scatter_scalar(scalars, Column scatter_map,
-                    Table target_table, bool bounds_check=True):
+    # Needed for the table branch
+    cdef table_view source_table_view
 
+    # Needed for the scalar branch
     cdef vector[reference_wrapper[constscalar]] source_scalars
-    source_scalars.reserve(len(scalars))
-    cdef bool c_bounds_check = bounds_check
     cdef DeviceScalar slr
-    for val, col in zip(scalars, target_table._columns):
-        slr = as_device_scalar(val, col.dtype)
+
+    if isinstance(source, Column):
+        source_table_view = make_table_view((<Column> source,))
+
+        with nogil:
+            c_result = move(
+                cpp_copying.scatter(
+                    source_table_view,
+                    scatter_map_view,
+                    target_table_view,
+                    c_bounds_check
+                )
+            )
+    else:
+        slr = as_device_scalar(source, target_column.dtype)
         source_scalars.push_back(reference_wrapper[constscalar](
             slr.get_raw_ptr()[0]))
-    cdef column_view scatter_map_view = scatter_map.view()
-    cdef table_view target_table_view = target_table.data_view()
-
-    cdef unique_ptr[table] c_result
 
-    with nogil:
-        c_result = move(
-            cpp_copying.scatter(
-                source_scalars,
-                scatter_map_view,
-                target_table_view,
-                c_bounds_check
+        with nogil:
+            c_result = move(
+                cpp_copying.scatter(
+                    source_scalars,
+                    scatter_map_view,
+                    target_table_view,
+                    c_bounds_check
+                )
             )
-        )
 
     data, _ = data_from_unique_ptr(
         move(c_result),
-        column_names=target_table._column_names,
+        column_names=(None,),
         index_names=None
     )
 
-    return data, (
-        None if target_table._index is None else target_table._index.copy(
-            deep=False)
-    )
-
-
-def scatter(object input, object scatter_map, Table target,
-            bool bounds_check=True):
-    """
-    Scattering input into target as per the scatter map,
-    input can be a list of scalars or can be a table
-    """
-
-    from cudf.core.column.column import as_column
-
-    if not isinstance(scatter_map, Column):
-        scatter_map = as_column(scatter_map)
-
-    if isinstance(input, Table):
-        return _scatter_table(input, scatter_map, target, bounds_check)
-    else:
-        return _scatter_scalar(input, scatter_map, target, bounds_check)
+    return next(iter(data.values()))
 
 
 def _reverse_column(Column source_column):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index d52f63a79f5..c834efec9fb 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -599,17 +599,13 @@ def __setitem__(self, key: Any, value: Any):
             )
         else:
             try:
-                if is_scalar(value):
-                    input = self
-                    out = input.as_frame()._scatter(key, [value])._as_column()
-                else:
-                    if not isinstance(value, Column):
-                        value = as_column(value)
-                    out = (
-                        self.as_frame()
-                        ._scatter(key, value.as_frame())
-                        ._as_column()
-                    )
+                if not isinstance(key, Column):
+                    key = as_column(key)
+                if not is_scalar(value) and not isinstance(value, Column):
+                    value = as_column(value)
+                out = libcudf.copying.scatter(
+                    value, key, self
+                )._with_type_metadata(self.dtype)
             except RuntimeError as e:
                 if "out of bounds" in str(e):
                     raise IndexError(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 9f743cd8c85..4f46794aa3f 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -692,14 +692,6 @@ def _as_column(self):
 
         return self._data[None].copy(deep=False)
 
-    def _scatter(self, key, value):
-        result = self.__class__._from_data(
-            *libcudf.copying.scatter(value, key, self)
-        )
-
-        result._copy_type_metadata(self)
-        return result
-
     def _empty_like(self, keep_index=True):
         result = self.__class__._from_data(
             *libcudf.copying.table_empty_like(self, keep_index)

From 0ad36ff30fd693e9778ec7af206d5e5460d2bfa5 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 26 Aug 2021 11:38:04 -0500
Subject: [PATCH 41/46] Add backend for `percentile_lookup` dispatch (#9118)

This PR adds a backend for the `percentile_lookup` dispatch in `dask_cudf`. The related upstream dask changes were made in https://github.com/dask/dask/pull/8083/

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cudf/pull/9118
---
 python/dask_cudf/dask_cudf/backends.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index c0204190957..299d6f7b119 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -259,9 +259,14 @@ def is_categorical_dtype_cudf(obj):
 
 
 try:
-    from dask.dataframe.dispatch import percentile_dispatch
+    try:
+        from dask.array.dispatch import percentile_lookup
+    except ImportError:
+        from dask.dataframe.dispatch import (
+            percentile_dispatch as percentile_lookup,
+        )
 
-    @percentile_dispatch.register((cudf.Series, cp.ndarray, cudf.Index))
+    @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.Index))
     def percentile_cudf(a, q, interpolation="linear"):
         # Cudf dispatch to the equivalent of `np.percentile`:
         # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html

From 263190a99aaaff06222568aacb364c9d3c7a5552 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 26 Aug 2021 13:59:44 -0500
Subject: [PATCH 42/46] Preserve float16 upscaling (#9069)

Fixes: #9065

This PR enables using `np.dtype` only for the `__cuda_array_interface__` scenario in `as_column`. The dtype in this array interface is guaranteed to be numeric, which `np.dtype` can handle. Also, `float16` dtype upcasting logic is already in place below, i.e., at line 1760.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9069
---
 python/cudf/cudf/_fuzz_testing/csv.py         |  4 +-
 python/cudf/cudf/_fuzz_testing/json.py        |  4 +-
 python/cudf/cudf/_fuzz_testing/utils.py       |  6 +-
 python/cudf/cudf/_lib/aggregation.pyx         | 14 ++--
 python/cudf/cudf/_lib/binaryop.pyx            |  4 +-
 python/cudf/cudf/_lib/column.pyx              |  5 +-
 python/cudf/cudf/_lib/orc.pyx                 | 13 ++--
 python/cudf/cudf/_lib/reduce.pyx              |  7 +-
 python/cudf/cudf/_lib/scalar.pyx              |  6 +-
 python/cudf/cudf/_lib/string_casting.pyx      | 14 ++--
 .../strings/convert/convert_fixed_point.pyx   |  2 +-
 python/cudf/cudf/_lib/transform.pyx           | 10 ++-
 python/cudf/cudf/_lib/types.pyx               | 26 ++------
 python/cudf/cudf/_lib/unary.pyx               |  7 +-
 python/cudf/cudf/_lib/utils.pyx               |  4 +-
 python/cudf/cudf/core/column/column.py        | 32 +++++++---
 python/cudf/cudf/core/column/numerical.py     |  8 +--
 python/cudf/cudf/core/column/timedelta.py     |  4 +-
 python/cudf/cudf/core/dtypes.py               |  8 +--
 python/cudf/cudf/tests/test_dataframe.py      | 10 +--
 python/cudf/cudf/tests/test_dtypes.py         | 22 +++++--
 python/cudf/cudf/tests/test_numerical.py      |  6 +-
 python/cudf/cudf/tests/test_replace.py        |  6 +-
 python/cudf/cudf/tests/test_repr.py           |  4 +-
 python/cudf/cudf/tests/test_series.py         | 21 ++++++
 python/cudf/cudf/utils/dtypes.py              | 64 ++++++++++---------
 26 files changed, 177 insertions(+), 134 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py
index 0acb9c8a471..5f628904276 100644
--- a/python/cudf/cudf/_fuzz_testing/csv.py
+++ b/python/cudf/cudf/_fuzz_testing/csv.py
@@ -13,7 +13,7 @@
     pyarrow_to_pandas,
 )
 from cudf.testing import dataset_generator as dg
-from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
+from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
@@ -100,7 +100,7 @@ def set_rand_params(self, params):
                         dtype_val = {
                             col_name: "category"
                             if cudf.utils.dtypes.is_categorical_dtype(dtype)
-                            else pandas_dtypes_to_cudf_dtypes[dtype]
+                            else pandas_dtypes_to_np_dtypes[dtype]
                             for col_name, dtype in dtype_val.items()
                         }
                     params_dict[param] = dtype_val
diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
index df9226cf059..8a8a3d5bff7 100644
--- a/python/cudf/cudf/_fuzz_testing/json.py
+++ b/python/cudf/cudf/_fuzz_testing/json.py
@@ -14,7 +14,7 @@
     pyarrow_to_pandas,
 )
 from cudf.testing import dataset_generator as dg
-from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
+from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
@@ -31,7 +31,7 @@ def _get_dtype_param_value(dtype_val):
                 processed_dtypes[col_name] = "category"
             else:
                 processed_dtypes[col_name] = str(
-                    pandas_dtypes_to_cudf_dtypes.get(dtype, dtype)
+                    pandas_dtypes_to_np_dtypes.get(dtype, dtype)
                 )
         return processed_dtypes
     return dtype_val
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 83ab02351f2..0e68f1c71cc 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -11,7 +11,7 @@
 import cudf
 from cudf.testing._utils import assert_eq
 from cudf.utils.dtypes import (
-    pandas_dtypes_to_cudf_dtypes,
+    pandas_dtypes_to_np_dtypes,
     pyarrow_dtypes_to_pandas_dtypes,
 )
 
@@ -218,7 +218,7 @@ def convert_nulls_to_none(records, df):
     scalar_columns_convert = [
         col
         for col in df.columns
-        if df[col].dtype in pandas_dtypes_to_cudf_dtypes
+        if df[col].dtype in pandas_dtypes_to_np_dtypes
         or pd.api.types.is_datetime64_dtype(df[col].dtype)
         or pd.api.types.is_timedelta64_dtype(df[col].dtype)
     ]
@@ -263,7 +263,7 @@ def _null_to_None(value):
     has_nulls_or_nullable_dtype = any(
         [
             True
-            if df[col].dtype in pandas_dtypes_to_cudf_dtypes
+            if df[col].dtype in pandas_dtypes_to_np_dtypes
             or df[col].isnull().any()
             else False
             for col in df.columns
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index da407cdbfa8..097018fe3c0 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -11,7 +11,11 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.types import NullHandling, cudf_to_np_types, np_to_cudf_types
+from cudf._lib.types import (
+    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
+    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
+    NullHandling,
+)
 from cudf.utils import cudautils
 
 from cudf._lib.types cimport (
@@ -281,7 +285,7 @@ cdef class Aggregation:
         compiled_op = cudautils.compile_udf(op, type_signature)
         output_np_dtype = cudf.dtype(compiled_op[1])
         cpp_str = compiled_op[0].encode('UTF-8')
-        if output_np_dtype not in np_to_cudf_types:
+        if output_np_dtype not in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
             raise TypeError(
                 "Result of window function has unsupported dtype {}"
                 .format(op[1])
@@ -289,7 +293,7 @@ cdef class Aggregation:
         tid = (
             <libcudf_types.type_id> (
                 <underlying_type_t_type_id> (
-                    np_to_cudf_types[output_np_dtype]
+                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[output_np_dtype]
                 )
             )
         )
@@ -425,7 +429,7 @@ cdef class RollingAggregation:
         compiled_op = cudautils.compile_udf(op, type_signature)
         output_np_dtype = cudf.dtype(compiled_op[1])
         cpp_str = compiled_op[0].encode('UTF-8')
-        if output_np_dtype not in np_to_cudf_types:
+        if output_np_dtype not in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
             raise TypeError(
                 "Result of window function has unsupported dtype {}"
                 .format(op[1])
@@ -433,7 +437,7 @@ cdef class RollingAggregation:
         tid = (
             <libcudf_types.type_id> (
                 <underlying_type_t_type_id> (
-                    np_to_cudf_types[output_np_dtype]
+                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[output_np_dtype]
                 )
             )
         )
diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 7e0be09236f..d27ac533304 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -16,7 +16,7 @@ from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -212,7 +212,7 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
     cdef type_id tid = (
         <type_id> (
             <underlying_type_t_type_id> (
-                np_to_cudf_types[cudf.dtype(dtype)]
+                SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype(dtype)]
             )
         )
     )
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index b5223a32a18..f833f6e3150 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -30,7 +30,10 @@ from cudf._lib.cpp.strings.convert.convert_integers cimport (
     from_integers as cpp_from_integers,
 )
 
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
+from cudf._lib.types import (
+    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
+    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
+)
 
 from cudf._lib.types cimport (
     dtype_from_column_view,
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 995243c7ea7..b0cbfb33931 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -8,11 +8,8 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-
-from cudf.utils.dtypes import is_struct_dtype
-
 from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.io.orc cimport (
     chunked_orc_writer_options,
     orc_chunked_writer,
@@ -45,7 +42,7 @@ from cudf._lib.io.utils cimport (
 )
 from cudf._lib.table cimport Table
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from cudf._lib.types cimport underlying_type_t_type_id
 
@@ -53,7 +50,7 @@ import numpy as np
 
 from cudf._lib.utils cimport data_from_unique_ptr, get_column_names
 
-from cudf._lib.utils import _index_level_name, generate_pandas_metadata
+from cudf._lib.utils import generate_pandas_metadata
 
 
 cpdef read_raw_orc_statistics(filepath_or_buffer):
@@ -97,7 +94,9 @@ cpdef read_orc(object filepaths_or_buffers,
             if timestamp_type is None else
             <type_id>(
                 <underlying_type_t_type_id> (
-                    np_to_cudf_types[cudf.dtype(timestamp_type)]
+                    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+                        cudf.dtype(timestamp_type)
+                    ]
                 )
             )
         ),
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 49ebb0a2528..87da5526d3c 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -1,7 +1,6 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 import cudf
-from cudf.core.dtypes import Decimal64Dtype
 from cudf.utils.dtypes import is_decimal_dtype
 
 from cudf._lib.column cimport Column
@@ -12,7 +11,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.types cimport data_type, type_id
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move, pair
@@ -76,7 +75,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
         scale = -c_result.get()[0].type().scale()
         precision = _reduce_precision(col_dtype, reduction_op, len(incol))
         py_result = DeviceScalar.from_unique_ptr(
-            move(c_result), dtype=Decimal64Dtype(precision, scale)
+            move(c_result), dtype=cudf.Decimal64Dtype(precision, scale)
         )
     else:
         py_result = DeviceScalar.from_unique_ptr(move(c_result))
@@ -160,4 +159,4 @@ def _reduce_precision(dtype, op, nrows):
         new_p = 2 * p + nrows
     else:
         raise NotImplementedError()
-    return max(min(new_p, Decimal64Dtype.MAX_PRECISION), 0)
+    return max(min(new_p, cudf.Decimal64Dtype.MAX_PRECISION), 0)
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index fe11d5e2627..c0cae16d9ef 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -21,7 +21,7 @@ from libcpp.utility cimport move
 
 import cudf
 from cudf._lib.types import (
-    cudf_to_np_types,
+    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
     datetime_unit_map,
     duration_unit_map,
 )
@@ -199,7 +199,7 @@ cdef class DeviceScalar:
                 )
             else:
                 s._dtype = ListDtype(
-                    cudf_to_np_types[
+                    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
                         <underlying_type_t_type_id>(
                             (<list_scalar*>s.get_raw_ptr())[0]
                             .view().type().id()
@@ -210,7 +210,7 @@ cdef class DeviceScalar:
             if dtype is not None:
                 s._dtype = dtype
             else:
-                s._dtype = cudf_to_np_types[
+                s._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
                     <underlying_type_t_type_id>(cdtype.id())
                 ]
         return s
diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index 25e4149183e..74490d6bb19 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -8,7 +8,7 @@ from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -72,7 +72,7 @@ def string_to_floating(Column input_col, object out_type):
     cdef unique_ptr[column] c_result
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[out_type]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
         )
     )
     cdef data_type c_out_type = data_type(tid)
@@ -165,7 +165,7 @@ def string_to_integer(Column input_col, object out_type):
     cdef unique_ptr[column] c_result
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[out_type]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type]
         )
     )
     cdef data_type c_out_type = data_type(tid)
@@ -552,7 +552,7 @@ def timestamp2int(Column input_col, dtype, format):
     cdef column_view input_column_view = input_col.view()
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[dtype]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
         )
     )
     cdef data_type out_type = data_type(tid)
@@ -617,7 +617,7 @@ def timedelta2int(Column input_col, dtype, format):
     cdef column_view input_column_view = input_col.view()
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[dtype]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype]
         )
     )
     cdef data_type out_type = data_type(tid)
@@ -744,7 +744,9 @@ def htoi(Column input_col, **kwargs):
     cdef column_view input_column_view = input_col.view()
     cdef type_id tid = <type_id> (
         <underlying_type_t_type_id> (
-            np_to_cudf_types[kwargs.get('dtype', cudf.dtype("int64"))]
+            SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+                kwargs.get('dtype', cudf.dtype("int64"))
+            ]
         )
     )
     cdef data_type c_out_type = data_type(tid)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
index e35ab6489c6..54e85d8833f 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
@@ -4,7 +4,7 @@ import numpy as np
 
 from cudf._lib.column cimport Column
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 9fada59640e..60e6132fe7f 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -20,7 +20,7 @@ from cudf.core.buffer import Buffer
 
 from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -103,7 +103,9 @@ def transform(Column input, op):
 
     try:
         c_tid = <type_id> (
-            <underlying_type_t_type_id> np_to_cudf_types[np_dtype]
+            <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+                np_dtype
+            ]
         )
         c_dtype = data_type(c_tid)
 
@@ -131,7 +133,9 @@ def masked_udf(Table incols, op, output_type):
     cdef data_type c_dtype
 
     c_tid = <type_id> (
-        <underlying_type_t_type_id> np_to_cudf_types[output_type]
+        <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
+            output_type
+        ]
     )
     c_dtype = data_type(c_tid)
 
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index d3a4c45f213..e798d78d426 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -6,6 +6,7 @@ import numpy as np
 
 from libcpp.memory cimport make_shared, shared_ptr
 
+cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf._lib.types cimport (
@@ -15,21 +16,6 @@ from cudf._lib.types cimport (
     underlying_type_t_sorted,
 )
 
-from cudf.core.dtypes import (
-    Decimal32Dtype,
-    Decimal64Dtype,
-    ListDtype,
-    StructDtype,
-)
-from cudf.utils.dtypes import (
-    is_decimal32_dtype,
-    is_decimal64_dtype,
-    is_decimal_dtype,
-    is_list_dtype,
-    is_struct_dtype,
-)
-
-cimport cudf._lib.cpp.types as libcudf_types
 import cudf
 
 
@@ -82,7 +68,7 @@ class TypeId(IntEnum):
     DECIMAL64 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL64
 
 
-np_to_cudf_types = {
+SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = {
     np.dtype("int8"): TypeId.INT8,
     np.dtype("int16"): TypeId.INT16,
     np.dtype("int32"): TypeId.INT32,
@@ -105,7 +91,7 @@ np_to_cudf_types = {
     np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS,
 }
 
-cudf_to_np_types = {
+LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
     TypeId.INT8: np.dtype("int8"),
     TypeId.INT16: np.dtype("int16"),
     TypeId.INT32: np.dtype("int32"),
@@ -221,7 +207,9 @@ cdef dtype_from_column_view(column_view cv):
             scale=-cv.type().scale()
         )
     else:
-        return cudf_to_np_types[<underlying_type_t_type_id>(tid)]
+        return LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+            <underlying_type_t_type_id>(tid)
+        ]
 
 cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
     if cudf.api.types.is_list_dtype(dtype):
@@ -235,7 +223,7 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
     else:
         tid = <libcudf_types.type_id> (
             <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(dtype)]))
+                SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)]))
 
     if tid in (
         libcudf_types.type_id.DECIMAL64,
diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx
index c06723fe442..2b6f3e8b4c1 100644
--- a/python/cudf/cudf/_lib/unary.pyx
+++ b/python/cudf/cudf/_lib/unary.pyx
@@ -14,11 +14,14 @@ from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
 
-from cudf._lib.types import np_to_cudf_types
+from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
 
-from cudf._lib.column import cudf_to_np_types, np_to_cudf_types
+from cudf._lib.column import (
+    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES,
+    SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
+)
 
 cimport cudf._lib.cpp.types as libcudf_types
 cimport cudf._lib.cpp.unary as libcudf_unary
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 81b62159b59..cd258102228 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -23,17 +23,17 @@ except ImportError:
     import json
 
 from cudf.utils.dtypes import (
-    cudf_dtypes_to_pandas_dtypes,
     is_categorical_dtype,
     is_decimal_dtype,
     is_list_dtype,
     is_struct_dtype,
+    np_dtypes_to_pandas_dtypes,
     np_to_pa_dtype,
 )
 
 PARQUET_META_TYPE_MAP = {
     str(cudf_dtype): str(pandas_dtype)
-    for cudf_dtype, pandas_dtype in cudf_dtypes_to_pandas_dtypes.items()
+    for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items()
 }
 
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index c834efec9fb..bd4ca2bdcec 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -69,7 +69,7 @@
     min_unsigned_type,
     np_to_pa_dtype,
     pandas_dtypes_alias_to_cudf_alias,
-    pandas_dtypes_to_cudf_dtypes,
+    pandas_dtypes_to_np_dtypes,
 )
 from cudf.utils.utils import mask_dtype
 
@@ -894,7 +894,7 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
         dtype = (
             pandas_dtypes_alias_to_cudf_alias.get(dtype, dtype)
             if isinstance(dtype, str)
-            else pandas_dtypes_to_cudf_dtypes.get(dtype, dtype)
+            else pandas_dtypes_to_np_dtypes.get(dtype, dtype)
         )
         if _is_non_decimal_numeric_dtype(dtype):
             return self.as_numerical_column(dtype, **kwargs)
@@ -1729,9 +1729,13 @@ def as_column(
 
     elif hasattr(arbitrary, "__cuda_array_interface__"):
         desc = arbitrary.__cuda_array_interface__
-        current_dtype = cudf.dtype(desc["typestr"])
+        current_dtype = np.dtype(desc["typestr"])
 
-        arb_dtype = cudf.dtype(current_dtype)
+        arb_dtype = (
+            np.dtype("float32")
+            if current_dtype == "float16"
+            else cudf.dtype(current_dtype)
+        )
 
         if desc.get("mask", None) is not None:
             # Extract and remove the mask from arbitrary before
@@ -1775,6 +1779,12 @@ def as_column(
         return col
 
     elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
+        if isinstance(arbitrary, pa.lib.HalfFloatArray):
+            raise NotImplementedError(
+                "Type casting from `float16` to `float32` is not "
+                "yet supported in pyarrow, see: "
+                "https://issues.apache.org/jira/browse/ARROW-3802"
+            )
         col = ColumnBase.from_arrow(arbitrary)
         if isinstance(arbitrary, pa.NullArray):
             if type(dtype) == str and dtype == "empty":
@@ -1797,7 +1807,7 @@ def as_column(
         elif arbitrary.dtype == np.bool_:
             data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
         elif arbitrary.dtype.kind in ("f"):
-            arb_dtype = cudf.dtype(arbitrary.dtype)
+            arb_dtype = np.dtype(arbitrary.dtype)
             data = as_column(
                 cupy.asarray(arbitrary, dtype=arb_dtype),
                 nan_as_null=nan_as_null,
@@ -1850,7 +1860,7 @@ def as_column(
         # CUDF assumes values are always contiguous
         desc = arbitrary.__array_interface__
         shape = desc["shape"]
-        arb_dtype = cudf.dtype(desc["typestr"])
+        arb_dtype = np.dtype(desc["typestr"])
         # CUDF assumes values are always contiguous
         if len(shape) > 1:
             raise ValueError("Data must be 1-dimensional")
@@ -1941,6 +1951,8 @@ def as_column(
             if dtype is not None:
                 data = data.astype(dtype)
         elif arb_dtype.kind in ("f"):
+            if arb_dtype == np.dtype("float16"):
+                arb_dtype = np.dtype("float32")
             arb_dtype = cudf.dtype(arb_dtype if dtype is None else dtype)
             data = as_column(
                 cupy.asarray(arbitrary, dtype=arb_dtype),
@@ -1956,7 +1968,11 @@ def as_column(
             if arbitrary.dtype == pd.StringDtype():
                 arb_dtype = cudf.dtype("O")
             else:
-                arb_dtype = cudf.dtype(arbitrary.dtype)
+                arb_dtype = (
+                    cudf.dtype("float32")
+                    if arbitrary.dtype == "float16"
+                    else cudf.dtype(arbitrary.dtype)
+                )
                 if arb_dtype != arbitrary.dtype.numpy_dtype:
                     arbitrary = arbitrary.astype(arb_dtype)
         if (
@@ -2131,7 +2147,7 @@ def _construct_array(
             arbitrary,
             dtype=native_dtype
             if native_dtype is None
-            else cudf.dtype(native_dtype),
+            else np.dtype(native_dtype),
         )
     return arbitrary
 
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index db1829d5f38..736aa5b5a7b 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -25,9 +25,9 @@
 from cudf.utils import cudautils, utils
 from cudf.utils.dtypes import (
     NUMERIC_TYPES,
-    cudf_dtypes_to_pandas_dtypes,
     min_column_type,
     min_signed_type,
+    np_dtypes_to_pandas_dtypes,
     numeric_normalize_types,
     to_cudf_compatible_scalar,
 )
@@ -197,7 +197,7 @@ def normalize_binop_value(
             if isinstance(other, cudf.Scalar):
                 return other
             other_dtype = np.promote_types(self.dtype, other_dtype)
-            if other_dtype == cudf.dtype("float16"):
+            if other_dtype == np.dtype("float16"):
                 other_dtype = cudf.dtype("float32")
                 other = other_dtype.type(other)
             if self.dtype.kind == "b":
@@ -587,8 +587,8 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
     def to_pandas(
         self, index: pd.Index = None, nullable: bool = False, **kwargs
     ) -> "pd.Series":
-        if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes:
-            pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype]
+        if nullable and self.dtype in np_dtypes_to_pandas_dtypes:
+            pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype]
             arrow_array = self.to_arrow()
             pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)
             pd_series = pd.Series(pandas_array, copy=False)
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 7c1250231f3..1c4ed4c7f98 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -586,9 +586,9 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
 
 
 def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype:
-    if np.can_cast(cudf.dtype(lhs_dtype), cudf.dtype(rhs_dtype)):
+    if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)):
         return rhs_dtype
-    elif np.can_cast(cudf.dtype(rhs_dtype), cudf.dtype(lhs_dtype)):
+    elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)):
         return lhs_dtype
     else:
         raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}")
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index ead0b6453c1..1b504310e99 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -36,16 +36,12 @@ def dtype(arbitrary):
     # first, try interpreting arbitrary as a NumPy dtype that we support:
     try:
         np_dtype = np.dtype(arbitrary)
-        if np_dtype.name == "float16":
-            return np.dtype("float32")
-        elif np_dtype.name == "float128":
-            raise NotImplementedError()
-        elif np_dtype.kind in ("OU"):
+        if np_dtype.kind in ("OU"):
             return np.dtype("object")
     except TypeError:
         pass
     else:
-        if np_dtype.kind not in "biufUOMm":
+        if np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
             raise TypeError(f"Unsupported type {np_dtype}")
         return np_dtype
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index a337660b5b0..0eb7969f2d7 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -3669,7 +3669,7 @@ def test_one_row_head():
     "np_dtype,pd_dtype",
     [
         tuple(item)
-        for item in cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes.items()
+        for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items()
     ],
 )
 def test_series_astype_pandas_nullable(dtype, np_dtype, pd_dtype):
@@ -5909,17 +5909,17 @@ def test_df_string_cat_types_mask_where(data, condition, other, has_cat):
         (
             pd.Series([random.random() for _ in range(10)], dtype="float128"),
             None,
-            NotImplementedError,
+            TypeError,
         ),
     ],
 )
 def test_from_pandas_unsupported_types(data, expected_upcast_type, error):
     pdf = pd.DataFrame({"one_col": data})
-    if error == NotImplementedError:
-        with pytest.raises(error):
+    if error is not None:
+        with pytest.raises(ValueError):
             cudf.from_pandas(data)
 
-        with pytest.raises(error):
+        with pytest.raises(ValueError):
             cudf.Series(data)
 
         with pytest.raises(error):
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index ee6cc7b6df6..d98ab0504cc 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -264,7 +264,6 @@ def test_lists_of_structs_dtype(data):
     [
         (np.dtype("int8"), np.dtype("int8")),
         (np.int8, np.dtype("int8")),
-        (np.float16, np.dtype("float32")),
         (pd.Int8Dtype(), np.dtype("int8")),
         (pd.StringDtype(), np.dtype("object")),
         ("int8", np.dtype("int8")),
@@ -274,17 +273,12 @@ def test_lists_of_structs_dtype(data):
         (int, np.dtype("int64")),
         (float, np.dtype("float64")),
         (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
-        ("float16", np.dtype("float32")),
         (np.dtype("U"), np.dtype("object")),
-        ("timedelta64", np.dtype("<m8")),
         ("timedelta64[ns]", np.dtype("<m8[ns]")),
         ("timedelta64[ms]", np.dtype("<m8[ms]")),
-        ("timedelta64[D]", np.dtype("<m8[D]")),
         ("<m8[s]", np.dtype("<m8[s]")),
-        ("datetime64", np.dtype("<M8")),
         ("datetime64[ns]", np.dtype("<M8[ns]")),
         ("datetime64[ms]", np.dtype("<M8[ms]")),
-        ("datetime64[D]", np.dtype("<M8[D]")),
         ("<M8[s]", np.dtype("<M8[s]")),
         (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
         ("category", cudf.CategoricalDtype()),
@@ -311,7 +305,21 @@ def test_dtype(in_dtype, expect):
 
 
 @pytest.mark.parametrize(
-    "in_dtype", ["complex", np.complex128, complex, "S", "a", "V"]
+    "in_dtype",
+    [
+        "complex",
+        np.complex128,
+        complex,
+        "S",
+        "a",
+        "V",
+        "float16",
+        np.float16,
+        "timedelta64",
+        "timedelta64[D]",
+        "datetime64[D]",
+        "datetime64",
+    ],
 )
 def test_dtype_raise(in_dtype):
     with pytest.raises(TypeError):
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 2e1ce5cddfc..cb4757d1ab7 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -7,7 +7,7 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_100
 from cudf.testing._utils import NUMERIC_TYPES, assert_eq
-from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes
+from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes
 
 
 def test_can_cast_safely_same_kind():
@@ -393,7 +393,7 @@ def test_series_construction_with_nulls(dtype, input_obj):
     dtype = cudf.dtype(dtype)
     # numpy case
 
-    expect = pd.Series(input_obj, dtype=cudf_dtypes_to_pandas_dtypes[dtype])
+    expect = pd.Series(input_obj, dtype=np_dtypes_to_pandas_dtypes[dtype])
     got = cudf.Series(input_obj, dtype=dtype).to_pandas(nullable=True)
 
     assert_eq(expect, got)
@@ -403,6 +403,6 @@ def test_series_construction_with_nulls(dtype, input_obj):
         dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj
     ]
 
-    expect = pd.Series(np_data, dtype=cudf_dtypes_to_pandas_dtypes[dtype])
+    expect = pd.Series(np_data, dtype=np_dtypes_to_pandas_dtypes[dtype])
     got = cudf.Series(np_data, dtype=dtype).to_pandas(nullable=True)
     assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index f60baec746f..33bc56a2522 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -273,9 +273,7 @@ def test_series_fillna_numerical(psr, data_dtype, fill_value, inplace):
     # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html
     if np.dtype(data_dtype).kind not in ("f") and test_psr.dtype.kind == "i":
         test_psr = test_psr.astype(
-            cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes[
-                np.dtype(data_dtype)
-            ]
+            cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[np.dtype(data_dtype)]
         )
 
     gsr = cudf.from_pandas(test_psr)
@@ -318,7 +316,7 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace):
     pdata = container(data)
 
     if np.dtype(data_dtype).kind not in ("f"):
-        data_dtype = cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes[
+        data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[
             np.dtype(data_dtype)
         ]
     pdata = pdata.astype(data_dtype)
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 3ef0e2edaed..fa6c4d9bf24 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -11,7 +11,7 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_110
 from cudf.testing import _utils as utils
-from cudf.utils.dtypes import cudf_dtypes_to_pandas_dtypes
+from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes
 
 repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"]
 
@@ -27,7 +27,7 @@ def test_null_series(nrows, dtype):
     if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}:
         ps = pd.Series(
             sr._column.data_array_view.copy_to_host(),
-            dtype=cudf_dtypes_to_pandas_dtypes.get(
+            dtype=np_dtypes_to_pandas_dtypes.get(
                 cudf.dtype(dtype), cudf.dtype(dtype)
             ),
         )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 5650790e970..bbc7ecc0c28 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -7,6 +7,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import cudf
@@ -1230,3 +1231,23 @@ def test_explode(data, ignore_index, p_index):
 def test_nested_series_from_sequence_data(data, expected):
     actual = cudf.Series(data)
     assert_eq(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cp.ones(5, dtype=cp.float16),
+        np.ones(5, dtype="float16"),
+        pd.Series([0.1, 1.2, 3.3], dtype="float16"),
+        pytest.param(
+            pa.array(np.ones(5, dtype="float16")),
+            marks=pytest.mark.xfail(
+                reason="https://issues.apache.org/jira/browse/ARROW-13762"
+            ),
+        ),
+    ],
+)
+def test_series_upcast_float16(data):
+    actual_series = cudf.Series(data)
+    expected_series = cudf.Series(data, dtype="float32")
+    assert_eq(actual_series, expected_series)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index d47fbee8e84..debc0ffdd53 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -53,17 +53,17 @@
     np.str_: pa.string(),
 }
 
-cudf_dtypes_to_pandas_dtypes = {
-    cudf.dtype("uint8"): pd.UInt8Dtype(),
-    cudf.dtype("uint16"): pd.UInt16Dtype(),
-    cudf.dtype("uint32"): pd.UInt32Dtype(),
-    cudf.dtype("uint64"): pd.UInt64Dtype(),
-    cudf.dtype("int8"): pd.Int8Dtype(),
-    cudf.dtype("int16"): pd.Int16Dtype(),
-    cudf.dtype("int32"): pd.Int32Dtype(),
-    cudf.dtype("int64"): pd.Int64Dtype(),
-    cudf.dtype("bool_"): pd.BooleanDtype(),
-    cudf.dtype("object"): pd.StringDtype(),
+np_dtypes_to_pandas_dtypes = {
+    np.dtype("uint8"): pd.UInt8Dtype(),
+    np.dtype("uint16"): pd.UInt16Dtype(),
+    np.dtype("uint32"): pd.UInt32Dtype(),
+    np.dtype("uint64"): pd.UInt64Dtype(),
+    np.dtype("int8"): pd.Int8Dtype(),
+    np.dtype("int16"): pd.Int16Dtype(),
+    np.dtype("int32"): pd.Int32Dtype(),
+    np.dtype("int64"): pd.Int64Dtype(),
+    np.dtype("bool_"): pd.BooleanDtype(),
+    np.dtype("object"): pd.StringDtype(),
 }
 
 pyarrow_dtypes_to_pandas_dtypes = {
@@ -79,17 +79,17 @@
     pa.string(): pd.StringDtype(),
 }
 
-pandas_dtypes_to_cudf_dtypes = {
-    pd.UInt8Dtype(): cudf.dtype("uint8"),
-    pd.UInt16Dtype(): cudf.dtype("uint16"),
-    pd.UInt32Dtype(): cudf.dtype("uint32"),
-    pd.UInt64Dtype(): cudf.dtype("uint64"),
-    pd.Int8Dtype(): cudf.dtype("int8"),
-    pd.Int16Dtype(): cudf.dtype("int16"),
-    pd.Int32Dtype(): cudf.dtype("int32"),
-    pd.Int64Dtype(): cudf.dtype("int64"),
-    pd.BooleanDtype(): cudf.dtype("bool_"),
-    pd.StringDtype(): cudf.dtype("object"),
+pandas_dtypes_to_np_dtypes = {
+    pd.UInt8Dtype(): np.dtype("uint8"),
+    pd.UInt16Dtype(): np.dtype("uint16"),
+    pd.UInt32Dtype(): np.dtype("uint32"),
+    pd.UInt64Dtype(): np.dtype("uint64"),
+    pd.Int8Dtype(): np.dtype("int8"),
+    pd.Int16Dtype(): np.dtype("int16"),
+    pd.Int32Dtype(): np.dtype("int32"),
+    pd.Int64Dtype(): np.dtype("int64"),
+    pd.BooleanDtype(): np.dtype("bool_"),
+    pd.StringDtype(): np.dtype("object"),
 }
 
 pandas_dtypes_alias_to_cudf_alias = {
@@ -105,10 +105,10 @@
 }
 
 if PANDAS_GE_120:
-    cudf_dtypes_to_pandas_dtypes[cudf.dtype("float32")] = pd.Float32Dtype()
-    cudf_dtypes_to_pandas_dtypes[cudf.dtype("float64")] = pd.Float64Dtype()
-    pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = cudf.dtype("float32")
-    pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = cudf.dtype("float64")
+    np_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype()
+    np_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype()
+    pandas_dtypes_to_np_dtypes[pd.Float32Dtype()] = np.dtype("float32")
+    pandas_dtypes_to_np_dtypes[pd.Float64Dtype()] = np.dtype("float64")
     pandas_dtypes_alias_to_cudf_alias["Float32"] = "float32"
     pandas_dtypes_alias_to_cudf_alias["Float64"] = "float64"
 
@@ -197,7 +197,7 @@ def cudf_dtype_from_pydata_dtype(dtype):
         return cudf.core.dtypes.Decimal32Dtype
     elif is_decimal64_dtype(dtype):
         return cudf.core.dtypes.Decimal64Dtype
-    elif dtype in cudf._lib.types.np_to_cudf_types:
+    elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
         return dtype.type
 
     return infer_dtype_from_object(dtype)
@@ -385,9 +385,7 @@ def min_column_type(x, expected_type):
         return x.dtype
 
     if np.issubdtype(x.dtype, np.floating):
-        max_bound_dtype = np.min_scalar_type(x.max())
-        min_bound_dtype = np.min_scalar_type(x.min())
-        result_type = np.promote_types(max_bound_dtype, min_bound_dtype)
+        return get_min_float_dtype(x)
 
     elif np.issubdtype(expected_type, np.integer):
         max_bound_dtype = np.min_scalar_type(x.max())
@@ -402,7 +400,9 @@ def min_column_type(x, expected_type):
 def get_min_float_dtype(col):
     max_bound_dtype = np.min_scalar_type(float(col.max()))
     min_bound_dtype = np.min_scalar_type(float(col.min()))
-    result_type = np.promote_types(max_bound_dtype, min_bound_dtype)
+    result_type = np.promote_types(
+        "float32", np.promote_types(max_bound_dtype, min_bound_dtype)
+    )
     return cudf.dtype(result_type)
 
 
@@ -542,6 +542,8 @@ def find_common_type(dtypes):
         dtypes.add(np.result_type(*td_dtypes))
 
     common_dtype = np.find_common_type(list(dtypes), [])
+    if common_dtype == np.dtype("float16"):
+        return cudf.dtype("float32")
     return cudf.dtype(common_dtype)
 
 
From 4e0584b7105209038394d01836d6d3e0b21c2ac3 Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Thu, 26 Aug 2021 18:19:14 -0400
Subject: [PATCH 43/46] Add handling for nested dicts in dask-cudf groupby
 (#9054)

Closes #9017

Adds handling for nested dict (renamed) aggregations supplied to dask-cudf's groupby, by storing the new aggregation names when standardizing the `aggs` input and applying them in `_finalize_gb_agg()`.

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Marlene (https://github.com/marlenezw)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: https://github.com/rapidsai/cudf/pull/9054
---
 python/cudf/cudf/core/dataframe.py            |  2 +-
 python/dask_cudf/dask_cudf/groupby.py         | 80 +++++++++++--------
 .../dask_cudf/dask_cudf/tests/test_groupby.py | 29 +++++++
 3 files changed, 78 insertions(+), 33 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 721ebf22de7..0d833a7d341 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2585,7 +2585,7 @@ def columns(self, columns):
 
         if not len(columns) == len(self._data.names):
             raise ValueError(
-                f"Length mismatch: expected {len(self._data.names)} elements ,"
+                f"Length mismatch: expected {len(self._data.names)} elements, "
                 f"got {len(columns)} elements"
             )
 
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 600d6cc7412..7e2c3a4f36c 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -198,34 +198,8 @@ def groupby_agg(
         in `dask.dataframe`, because it allows the cudf backend to
         perform multiple aggregations at once.
     """
-
-    # Deal with default split_out and split_every params
-    if split_every is False:
-        split_every = ddf.npartitions
-    split_every = split_every or 8
-    split_out = split_out or 1
-
-    # Standardize `gb_cols` and `columns` lists
-    aggs = _redirect_aggs(aggs_in.copy())
-    if isinstance(gb_cols, str):
-        gb_cols = [gb_cols]
-    columns = [c for c in ddf.columns if c not in gb_cols]
-    str_cols_out = False
-    if isinstance(aggs, dict):
-        # Use `str_cols_out` to specify if the output columns
-        # will have str (rather than MultiIndex/tuple) names.
-        # This happens when all values in the `aggs` dict are
-        # strings (no lists)
-        str_cols_out = True
-        for col in aggs:
-            if isinstance(aggs[col], str) or callable(aggs[col]):
-                aggs[col] = [aggs[col]]
-            else:
-                str_cols_out = False
-            if col in gb_cols:
-                columns.append(col)
-
     # Assert that aggregations are supported
+    aggs = _redirect_aggs(aggs_in)
     _supported = {
         "count",
         "mean",
@@ -244,10 +218,39 @@ def groupby_agg(
             f"Aggregations must be specified with dict or list syntax."
         )
 
-    # Always convert aggs to dict for consistency
+    # Deal with default split_out and split_every params
+    if split_every is False:
+        split_every = ddf.npartitions
+    split_every = split_every or 8
+    split_out = split_out or 1
+
+    # Standardize `gb_cols`, `columns`, and `aggs`
+    if isinstance(gb_cols, str):
+        gb_cols = [gb_cols]
+    columns = [c for c in ddf.columns if c not in gb_cols]
     if isinstance(aggs, list):
         aggs = {col: aggs for col in columns}
 
+    # Check if our output will have a MultiIndex; this will be the case if
+    # any value in the `aggs` dict is not a string (i.e. multiple/named
+    # aggregations per column)
+    str_cols_out = True
+    aggs_renames = {}
+    for col in aggs:
+        if isinstance(aggs[col], str) or callable(aggs[col]):
+            aggs[col] = [aggs[col]]
+        elif isinstance(aggs[col], dict):
+            str_cols_out = False
+            col_aggs = []
+            for k, v in aggs[col].items():
+                aggs_renames[col, v] = k
+                col_aggs.append(v)
+            aggs[col] = col_aggs
+        else:
+            str_cols_out = False
+        if col in gb_cols:
+            columns.append(col)
+
     # Begin graph construction
     dsk = {}
     token = tokenize(ddf, gb_cols, aggs)
@@ -314,6 +317,13 @@ def groupby_agg(
         for col in aggs:
             _aggs[col] = _aggs[col][0]
     _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs)
+    if aggs_renames:
+        col_array = []
+        agg_array = []
+        for col, agg in _meta.columns:
+            col_array.append(col)
+            agg_array.append(aggs_renames.get((col, agg), agg))
+        _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array])
     for s in range(split_out):
         dsk[(gb_agg_name, s)] = (
             _finalize_gb_agg,
@@ -326,6 +336,7 @@ def groupby_agg(
             sort,
             sep,
             str_cols_out,
+            aggs_renames,
         )
 
     divisions = [None] * (split_out + 1)
@@ -350,6 +361,10 @@ def _redirect_aggs(arg):
         for col in arg:
             if isinstance(arg[col], list):
                 new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]]
+            elif isinstance(arg[col], dict):
+                new_arg[col] = {
+                    k: redirects.get(v, v) for k, v in arg[col].items()
+                }
             else:
                 new_arg[col] = redirects.get(arg[col], arg[col])
         return new_arg
@@ -367,6 +382,8 @@ def _is_supported(arg, supported: set):
             for col in arg:
                 if isinstance(arg[col], list):
                     _global_set = _global_set.union(set(arg[col]))
+                elif isinstance(arg[col], dict):
+                    _global_set = _global_set.union(set(arg[col].values()))
                 else:
                     _global_set.add(arg[col])
         else:
@@ -460,10 +477,8 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
         agg = col.split(sep)[-1]
         if agg in ("count", "sum"):
             agg_dict[col] = ["sum"]
-        elif agg in ("min", "max"):
+        elif agg in ("min", "max", "collect"):
             agg_dict[col] = [agg]
-        elif agg == "collect":
-            agg_dict[col] = ["collect"]
         else:
             raise ValueError(f"Unexpected aggregation: {agg}")
 
@@ -508,6 +523,7 @@ def _finalize_gb_agg(
     sort,
     sep,
     str_cols_out,
+    aggs_renames,
 ):
     """ Final aggregation task.
 
@@ -564,7 +580,7 @@ def _finalize_gb_agg(
         else:
             name, agg = col.split(sep)
             col_array.append(name)
-            agg_array.append(agg)
+            agg_array.append(aggs_renames.get((name, agg), agg))
     if str_cols_out:
         gb.columns = col_array
     else:
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 61fa32b76ed..6569ffa94c5 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -645,3 +645,32 @@ def test_groupby_with_list_of_series():
     dd.assert_eq(
         gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"])
     )
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda df: df.groupby("x").agg({"y": {"foo": "sum"}}),
+        lambda df: df.groupby("x").agg({"y": {"foo": "sum", "bar": "count"}}),
+    ],
+)
+def test_groupby_nested_dict(func):
+    pdf = pd.DataFrame(
+        {
+            "x": np.random.randint(0, 5, size=10000),
+            "y": np.random.normal(size=10000),
+        }
+    )
+
+    ddf = dd.from_pandas(pdf, npartitions=5)
+    c_ddf = ddf.map_partitions(cudf.from_pandas)
+
+    a = func(ddf).compute()
+    b = func(c_ddf).compute().to_pandas()
+
+    a.index.name = None
+    a.name = None
+    b.index.name = None
+    b.name = None
+
+    dd.assert_eq(a, b)

From d9d565ef911f1a579085a33510119ae71d935b90 Mon Sep 17 00:00:00 2001
From: MithunR <mythrocks@gmail.com>
Date: Thu, 26 Aug 2021 15:39:11 -0700
Subject: [PATCH 44/46] Add support for `STRUCT` input to `groupby` (#9024)

This commit adds support for `STRUCT` columns in `groupby`. This should now allow for groupby aggregations to work when any of the grouping columns are `STRUCT`, including nested `STRUCTS`.

Note: List columns are still not supported on `groupby`, even as members of `STRUCT` columns, at any level of nesting. Only `STRUCT`, `STRUCT<STRUCT>`, etc. are currently supported.

Depends on #8956 (i.e. `unflatten_nested_columns()`).

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/9024
---
 .../cudf/detail/groupby/sort_helper.hpp       |  14 +-
 cpp/src/groupby/groupby.cu                    |  11 +-
 cpp/src/groupby/sort/sort_helper.cu           |  30 +-
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/groupby/structs_tests.cpp           | 319 ++++++++++++++++++
 5 files changed, 361 insertions(+), 14 deletions(-)
 create mode 100644 cpp/tests/groupby/structs_tests.cpp

diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index 471cd060dc3..1e36b2b2797 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -60,17 +60,7 @@ struct sort_groupby_helper {
    */
   sort_groupby_helper(table_view const& keys,
                       null_policy include_null_keys = null_policy::EXCLUDE,
-                      sorted keys_pre_sorted        = sorted::NO)
-    : _keys(keys),
-      _num_keys(-1),
-      _keys_pre_sorted(keys_pre_sorted),
-      _include_null_keys(include_null_keys)
-  {
-    if (keys_pre_sorted == sorted::YES and include_null_keys == null_policy::EXCLUDE and
-        has_nulls(keys)) {
-      _keys_pre_sorted = sorted::NO;
-    }
-  };
+                      sorted keys_pre_sorted        = sorted::NO);
 
   ~sort_groupby_helper()                          = default;
   sort_groupby_helper(sort_groupby_helper const&) = delete;
@@ -227,6 +217,8 @@ struct sort_groupby_helper {
   column_ptr _unsorted_keys_labels;  ///< Group labels for unsorted _keys
   column_ptr _keys_bitmask_column;   ///< Column representing rows with one or more nulls values
   table_view _keys;                  ///< Input keys to sort by
+  table_view _unflattened_keys;      ///< Input keys, unflattened and possibly nested
+  std::vector<column_ptr> _struct_null_vectors;  ///< Null vectors for struct columns in _keys
 
   index_vector_ptr
     _group_offsets;  ///< Indices into sorted _keys indicating starting index of each groups
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index a26d69e3d46..533f193d692 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -31,6 +31,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <structs/utilities.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -62,6 +63,8 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
+  using namespace cudf::structs::detail;
+
   // If sort groupby has been called once on this groupby object, then
   // always use sort groupby from now on. Because once keys are sorted,
   // all the aggs that can be done by hash groupby are efficiently done by
@@ -70,7 +73,13 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
   // satisfied with a hash implementation
   if (_keys_are_sorted == sorted::NO and not _helper and
       detail::hash::can_use_hash_groupby(_keys, requests)) {
-    return detail::hash::groupby(_keys, requests, _include_null_keys, stream, mr);
+    // Optionally flatten nested key columns.
+    auto [flattened_keys, _, __, ___] =
+      flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE);
+    auto [grouped_keys, results] =
+      detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr);
+    return std::make_pair(unflatten_nested_columns(std::move(grouped_keys), _keys),
+                          std::move(results));
   } else {
     return sort_aggregate(requests, stream, mr);
   }
diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu
index 5e944f75712..69d68f7b6bc 100644
--- a/cpp/src/groupby/sort/sort_helper.cu
+++ b/cpp/src/groupby/sort/sort_helper.cu
@@ -25,6 +25,7 @@
 #include <cudf/detail/sorting.hpp>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <structs/utilities.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -88,6 +89,31 @@ namespace cudf {
 namespace groupby {
 namespace detail {
 namespace sort {
+
+sort_groupby_helper::sort_groupby_helper(table_view const& keys,
+                                         null_policy include_null_keys,
+                                         sorted keys_pre_sorted)
+  : _unflattened_keys(keys),
+    _num_keys(-1),
+    _keys_pre_sorted(keys_pre_sorted),
+    _include_null_keys(include_null_keys)
+{
+  using namespace cudf::structs::detail;
+
+  auto [flattened_keys, _, __, struct_null_vectors] =
+    flatten_nested_columns(keys, {}, {}, column_nullability::FORCE);
+  _struct_null_vectors = std::move(struct_null_vectors);
+  _keys                = flattened_keys;
+
+  // Cannot depend on caller's sorting if the column contains nulls,
+  // and null values are to be excluded.
+  // Re-sort the data, to filter out nulls more easily.
+  if (keys_pre_sorted == sorted::YES and include_null_keys == null_policy::EXCLUDE and
+      has_nulls(keys)) {
+    _keys_pre_sorted = sorted::NO;
+  }
+};
+
 size_type sort_groupby_helper::num_keys(rmm::cuda_stream_view stream)
 {
   if (_num_keys > -1) return _num_keys;
@@ -309,7 +335,7 @@ std::unique_ptr<table> sort_groupby_helper::unique_keys(rmm::cuda_stream_view st
   auto gather_map_it = thrust::make_transform_iterator(
     group_offsets(stream).begin(), [idx_data] __device__(size_type i) { return idx_data[i]; });
 
-  return cudf::detail::gather(_keys,
+  return cudf::detail::gather(_unflattened_keys,
                               gather_map_it,
                               gather_map_it + num_groups(stream),
                               out_of_bounds_policy::DONT_CHECK,
@@ -320,7 +346,7 @@ std::unique_ptr<table> sort_groupby_helper::unique_keys(rmm::cuda_stream_view st
 std::unique_ptr<table> sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
-  return cudf::detail::gather(_keys,
+  return cudf::detail::gather(_unflattened_keys,
                               key_sort_order(stream),
                               cudf::out_of_bounds_policy::DONT_CHECK,
                               cudf::detail::negative_index_policy::NOT_ALLOWED,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index edfbba74eb1..d9553d463ab 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -80,6 +80,7 @@ ConfigureTest(GROUPBY_TEST
     groupby/replace_nulls_tests.cpp
     groupby/shift_tests.cpp
     groupby/std_tests.cpp
+    groupby/structs_tests.cpp
     groupby/sum_of_squares_tests.cpp
     groupby/sum_scan_tests.cpp
     groupby/sum_tests.cpp
diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp
new file mode 100644
index 00000000000..00126a4a5a0
--- /dev/null
+++ b/cpp/tests/groupby/structs_tests.cpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include "cudf/aggregation.hpp"
+#include "cudf/types.hpp"
+
+using namespace cudf::test::iterators;
+
+namespace cudf {
+namespace test {
+
+template <typename V>
+struct groupby_structs_test : public cudf::test::BaseFixture {
+};
+
+TYPED_TEST_CASE(groupby_structs_test, cudf::test::FixedWidthTypes);
+
+using V       = int32_t;  // Type of Aggregation Column.
+using M0      = int32_t;  // Type of STRUCT's first (i.e. 0th) member.
+using R       = cudf::detail::target_type_t<V, aggregation::SUM>;  // Type of aggregation result.
+using offsets = std::vector<cudf::offset_type>;
+using strings = strings_column_wrapper;
+using structs = structs_column_wrapper;
+
+template <typename T>
+using fwcw = fixed_width_column_wrapper<T>;
+
+template <typename T>
+using lcw = lists_column_wrapper<T>;
+
+namespace {
+static constexpr auto null = -1;  // Signifies null value.
+
+// Checking with a single aggregation, and aggregation column.
+// This test is orthogonal to the aggregation type; it focuses on testing the grouping
+// with STRUCT keys.
+auto sum_agg() { return cudf::make_sum_aggregation<groupby_aggregation>(); }
+
+// Set this to true to enable printing, for debugging.
+auto constexpr print_enabled = false;
+
+void print_agg_results(column_view const& keys, column_view const& vals)
+{
+  if constexpr (print_enabled) {
+    auto requests = std::vector<groupby::aggregation_request>{};
+    requests.push_back(groupby::aggregation_request{});
+    requests.back().values = vals;
+    requests.back().aggregations.push_back(sum_agg());
+    requests.back().aggregations.push_back(
+      cudf::make_nth_element_aggregation<groupby_aggregation>(0));
+
+    auto gby    = groupby::groupby{table_view({keys}), null_policy::INCLUDE, sorted::NO, {}, {}};
+    auto result = gby.aggregate(requests);
+    std::cout << "Results: Keys: " << std::endl;
+    print(result.first->get_column(0).view());
+    std::cout << "Results: Values: " << std::endl;
+    print(result.second.front().results[0]->view());
+  }
+}
+
+void test_sum_agg(column_view const& keys,
+                  column_view const& values,
+                  column_view const& expected_keys,
+                  column_view const& expected_values)
+{
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  sum_agg(),
+                  force_use_sort_impl::NO,
+                  null_policy::INCLUDE);
+  test_single_agg(keys,
+                  values,
+                  expected_keys,
+                  expected_values,
+                  sum_agg(),
+                  force_use_sort_impl::YES,
+                  null_policy::INCLUDE);
+}
+
+}  // namespace
+
+TYPED_TEST(groupby_structs_test, basic)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {  0,    1,    2,    3,    4,    5,    6,    7,    8,    9};
+  auto member_0 = fwcw<M0>{  1,    2,    3,    1,    2,    2,    1,    3,    3,    2};
+  auto member_1 = fwcw<M1>{ 11,   22,   33,   11,   22,   22,   11,   33,   33,   22};
+  auto member_2 = strings {"11", "22", "33", "11", "22", "22", "11", "33", "33", "22"};
+  auto keys     = structs{member_0, member_1, member_2};
+
+  auto expected_values   = fwcw<R> {  9,   19,   17 };
+  auto expected_member_0 = fwcw<M0>{  1,    2,    3 };
+  auto expected_member_1 = fwcw<M1>{ 11,   22,   33 };
+  auto expected_member_2 = strings {"11", "22", "33"};
+  auto expected_keys     = structs{expected_member_0, expected_member_1, expected_member_2};
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, structs_with_nulls_in_members)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {  0,       1,    2,    3,    4,    5,    6,      7,    8,    9 };
+  auto member_0 = fwcw<M0>{{ 1,    null,    3,    1,    2,    2,    1,      3,    3,    2 }, null_at(1)};
+  auto member_1 = fwcw<M1>{{ 11,     22,   33,   11,   22,   22,   11,   null,   33,   22 }, null_at(7)};
+  auto member_2 = strings { "11",   "22", "33", "11", "22", "22", "11",   "33", "33", "22"};
+  auto keys     = structs{{member_0, member_1, member_2}};
+  // clang-format on
+
+  print_agg_results(keys, values);
+
+  // clang-format off
+  auto expected_values   = fwcw<R> {    9,   18,    10,     7,     1  };
+  auto expected_member_0 = fwcw<M0>{ {  1,    2,     3,     3,  null  }, null_at(4)};
+  auto expected_member_1 = fwcw<M1>{ { 11,   22,    33,  null,    22  }, null_at(3)};
+  auto expected_member_2 = strings {  "11", "22",  "33",  "33",  "22" };
+  auto expected_keys     = structs{expected_member_0, expected_member_1, expected_member_2};
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, structs_with_null_rows)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {  0,    1,    2,    3,    4,    5,    6,    7,    8,    9};
+  auto member_0 = fwcw<M0>{  1,    2,    3,    1,    2,    2,    1,    3,    3,    2};
+  auto member_1 = fwcw<M1>{ 11,   22,   33,   11,   22,   22,   11,   33,   33,   22};
+  auto member_2 = strings {"11", "22", "33", "11", "22", "22", "11", "33", "33", "22"};
+  auto keys     = structs{{member_0, member_1, member_2}, nulls_at({0, 3})};
+
+  auto expected_values   = fwcw<R> {    6,   19,   17,      3  };
+  auto expected_member_0 = fwcw<M0>{ {  1,    2,    3,   null  }, null_at(3)};
+  auto expected_member_1 = fwcw<M1>{ { 11,   22,   33,   null  }, null_at(3)};
+  auto expected_member_2 = strings { {"11", "22", "33", "null" }, null_at(3)};
+  auto expected_keys     = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(3)};
+  // clang-format on
+
+  print_agg_results(keys, values);
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, structs_with_nulls_in_rows_and_members)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {  0,    1,    2,    3,    4,    5,    6,    7,    8,    9  };
+  auto member_0 = fwcw<M0>{{ 1,    2,    3,    1,    2,    2,    1,    3,    3,    2  }, null_at(1)};
+  auto member_1 = fwcw<M1>{{ 11,   22,   33,   11,   22,   22,   11,   33,   33,   22 }, null_at(7)};
+  auto member_2 = strings { "11", "22", "33", "11", "22", "22", "11", "33", "33", "22"};
+  auto keys     = structs{{member_0, member_1, member_2}, null_at(4)};
+  // clang-format on
+
+  print_agg_results(keys, values);
+
+  // clang-format off
+  auto expected_values   = fwcw<R> {    9,   14,    10,     7,     1,      4  };
+  auto expected_member_0 = fwcw<M0>{{   1,    2,     3,     3,  null,   null  }, nulls_at({4,5})};
+  auto expected_member_1 = fwcw<M1>{{  11,   22,    33,  null,    22,   null  }, nulls_at({3,5})};
+  auto expected_member_2 = strings {{ "11", "22",  "33",  "33",  "22", "null" }, null_at(5)};
+  auto expected_keys     = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(5)};
+  // clang-format on
+
+  print_agg_results(keys, values);
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, null_members_differ_from_null_structs)
+{
+  // This test specifically confirms that a non-null STRUCT row `{null, null, null}` is grouped
+  // differently from a null STRUCT row (whose members are incidentally null).
+
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {    0,      1,    2,    3,    4,    5,    6,    7,    8,    9 };
+  auto member_0 = fwcw<M0>{{   1,   null,    3,    1,    2,    2,    1,    3,    3,    2 }, null_at(1)};
+  auto member_1 = fwcw<M1>{{  11,   null,   33,   11,   22,   22,   11,   33,   33,   22 }, null_at(1)};
+  auto member_2 = strings {{ "11", "null", "33", "11", "22", "22", "11", "33", "33", "22"}, null_at(1)};
+  auto keys     = structs{{member_0, member_1, member_2}, null_at(4)};
+  // clang-format on
+
+  print_agg_results(keys, values);
+
+  // Index-3 => Non-null Struct row, with nulls for all members.
+  // Index-4 => Null Struct row.
+
+  // clang-format off
+  auto expected_values   = fwcw<R> {    9,   14,    17,      1,      4  };
+  auto expected_member_0 = fwcw<M0>{ {  1,    2,     3,   null,   null  }, nulls_at({3,4})};
+  auto expected_member_1 = fwcw<M1>{ { 11,   22,    33,   null,   null  }, nulls_at({3,4})};
+  auto expected_member_2 = strings { {"11", "22",  "33", "null", "null" }, nulls_at({3,4})};
+  auto expected_keys     = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(4)};
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, structs_of_structs)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values            = fwcw<V> {    0,      1,    2,    3,    4,    5,    6,    7,    8,    9 };
+  auto struct_0_member_0 = fwcw<M0>{{   1,   null,    3,    1,    2,    2,    1,    3,    3,    2 }, null_at(1)};
+  auto struct_0_member_1 = fwcw<M1>{{  11,   null,   33,   11,   22,   22,   11,   33,   33,   22 }, null_at(1)};
+  auto struct_0_member_2 = strings {{ "11", "null", "33", "11", "22", "22", "11", "33", "33", "22"}, null_at(1)};
+  // clang-format on
+
+  auto struct_0 = structs{{struct_0_member_0, struct_0_member_1, struct_0_member_2}, null_at(4)};
+  auto struct_1_member_1 = fwcw<M1>{8, 9, 6, 8, 0, 7, 8, 6, 6, 7};
+
+  auto keys = structs{{struct_0, struct_1_member_1}};  // Struct of structs.
+
+  print_agg_results(keys, values);
+
+  // clang-format off
+  auto expected_values            = fwcw<R> {    9,   14,    17,      1,      4  };
+  auto expected_member_0          = fwcw<M0>{ {  1,    2,     3,   null,   null  }, nulls_at({3,4})};
+  auto expected_member_1          = fwcw<M1>{ { 11,   22,    33,   null,   null  }, nulls_at({3,4})};
+  auto expected_member_2          = strings { {"11", "22",  "33", "null", "null" }, nulls_at({3,4})};
+  auto expected_structs           = structs{{expected_member_0, expected_member_1, expected_member_2}, null_at(4)};
+  auto expected_struct_1_member_1 = fwcw<M1>{    8,    7,     6,      9,      0  };
+  auto expected_keys              = structs{{expected_structs, expected_struct_1_member_1}};
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, empty_input)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {};
+  auto member_0 = fwcw<M0>{};
+  auto member_1 = fwcw<M1>{};
+  auto member_2 = strings {};
+  auto keys     = structs{member_0, member_1, member_2};
+
+  auto expected_values   = fwcw<R> {};
+  auto expected_member_0 = fwcw<M0>{};
+  auto expected_member_1 = fwcw<M1>{};
+  auto expected_member_2 = strings {};
+  auto expected_keys     = structs{expected_member_0, expected_member_1, expected_member_2};
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, all_null_input)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {  0,    1,    2,    3,    4,    5,    6,    7,    8,    9};
+  auto member_0 = fwcw<M0>{  1,    2,    3,    1,    2,    2,    1,    3,    3,    2};
+  auto member_1 = fwcw<M1>{ 11,   22,   33,   11,   22,   22,   11,   33,   33,   22};
+  auto member_2 = strings {"11", "22", "33", "11", "22", "22", "11", "33", "33", "22"};
+  auto keys     = structs{{member_0, member_1, member_2}, all_nulls()};
+
+  auto expected_values   = fwcw<R> {    45 };
+  auto expected_member_0 = fwcw<M0>{ null };
+  auto expected_member_1 = fwcw<M1>{ null };
+  auto expected_member_2 = strings {"null"};
+  auto expected_keys     = structs{{expected_member_0, expected_member_1, expected_member_2}, all_nulls()};
+  // clang-format on
+
+  test_sum_agg(keys, values, expected_keys, expected_values);
+}
+
+TYPED_TEST(groupby_structs_test, lists_are_unsupported)
+{
+  using M1 = TypeParam;  // Type of STRUCT's second (i.e. 1th) member.
+
+  // clang-format off
+  auto values   = fwcw<V> {     0,      1,      2,      3,       4  };
+  auto member_0 = lcw<M0> { {1,1},  {2,2},  {3,3},   {1,1},   {2,2} };
+  auto member_1 = fwcw<M1>{     1,      2,      3,      1,       2  };
+  // clang-format on
+  auto keys = structs{{member_0, member_1}};
+
+  EXPECT_THROW(test_sum_agg(keys, values, keys, values), cudf::logic_error);
+}
+
+}  // namespace test
+}  // namespace cudf

From 86a44593c0bcac16ac04768a5f5006ceb5608fc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?= <i.kone@edu.umi.ac.ma>
Date: Fri, 27 Aug 2021 00:03:53 +0100
Subject: [PATCH 45/46] Restructuring `Contributing doc` (#9026)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR is related to the issue raised [here](https://github.com/rapidsai/cudf/issues/8921). It refactors the contributing doc, with instructions for building `cuDF` locally in edit mode.

Authors:
  - Ismaël Koné (https://github.com/iskode)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/9026
---
 CONTRIBUTING.md | 299 ++++++++++++++++++++++++++++--------------------
 1 file changed, 176 insertions(+), 123 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 841a02f72e6..f83d7c5b759 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -55,87 +55,17 @@ implementation of the issue, ask them in the issue instead of the PR.
 
 The following instructions are for developers and contributors to cuDF OSS development. These instructions are tested on Linux Ubuntu 16.04 & 18.04. Use these instructions to build cuDF from source and contribute to its development.  Other operating systems may be compatible, but are not currently tested.
 
-### Code Formatting
-
-#### Python
-
-cuDF uses [Black](https://black.readthedocs.io/en/stable/),
-[isort](https://readthedocs.org/projects/isort/), and
-[flake8](http://flake8.pycqa.org/en/latest/) to ensure a consistent code format
-throughout the project. `Black`, `isort`, and `flake8` can be installed with
-`conda` or `pip`:
-
-```bash
-conda install black isort flake8
-```
-
-```bash
-pip install black isort flake8
-```
 
-These tools are used to auto-format the Python code, as well as check the Cython
-code in the repository. Additionally, there is a CI check in place to enforce
-that committed code follows our standards. You can use the tools to
-automatically format your python code by running:
 
-```bash
-isort --atomic python/**/*.py
-black python
-```
+### General requirements
 
-and then check the syntax of your Python and Cython code by running:
-
-```bash
-flake8 python
-flake8 --config=python/.flake8.cython
-```
-
-Additionally, many editors have plugins that will apply `isort` and `Black` as
-you edit files, as well as use `flake8` to report any style / syntax issues.
-
-#### C++/CUDA
-
-cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html)
-
-In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run:
-```
-python3 ./cpp/scripts/run-clang-format.py -inplace
-```
-
-Additionally, many editors have plugins or extensions that you can set up to automatically run `clang-format` either manually or on file save.
-
-#### Pre-commit hooks
-
-Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com/)
-to automatically run `isort`, `Black`, `flake8` and `clang-format` when you make a git commit.
-This can be done by installing `pre-commit` via `conda` or `pip`:
-
-```bash
-conda install -c conda-forge pre_commit
-```
-
-```bash
-pip install pre-commit
-```
-
-and then running:
-
-```bash
-pre-commit install
-```
-
-from the root of the cuDF repository. Now `isort`, `Black`, `flake8` and `clang-format` will be
-run each time you commit changes.
-
-### Get libcudf Dependencies
-
-Compiler requirements:
+Compilers:
 
 * `gcc`     version 9.3+
 * `nvcc`    version 11.0+
 * `cmake`   version 3.20.1+
 
-CUDA/GPU requirements:
+CUDA/GPU:
 
 * CUDA 11.0+
 * NVIDIA driver 450.80.02+
@@ -143,11 +73,7 @@ CUDA/GPU requirements:
 
 You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).
 
-## Script to build cuDF from source
-
-### Build from Source
-
-To install cuDF from source, ensure the dependencies are met and follow the steps below:
+### Create the build Environment
 
 - Clone the repository and submodules
 ```bash
@@ -166,86 +92,147 @@ conda activate cudf_dev
 ```
 - For other CUDA versions, check the corresponding cudf_dev_cuda*.yml file in conda/environments
 
-- Build and install `libcudf` after its dependencies. CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
+### Build cuDF from source
+
+- A `build.sh` script is provided in `$CUDF_HOME`. Running the script with no additional arguments will install the `libcudf`, `cudf` and `dask_cudf` libraries. By default, the libraries are installed to the `$CONDA_PREFIX` directory. To install into a different location, set the location in `$INSTALL_PREFIX`. Finally, note that the script depends on the `nvcc` executable being on your path, or defined in `$CUDACXX`.
+```bash
+cd $CUDF_HOME
+
+# Choose one of the following commands, depending on whether
+# you want to build and install the libcudf C++ library only, 
+# or include the cudf and/or dask_cudf Python libraries:
+
+./build.sh  # libcudf, cudf and dask_cudf
+./build.sh libcudf  # libcudf only
+./build.sh libcudf cudf  # libcudf and cudf only             
+```
+- Other libraries like `cudf-kafka` and `custreamz` can be installed with this script. For the complete list of libraries as well as details about the script usage, run the `help` command:
+```bash
+./build.sh --help            
+```
+
+### Build, install and test cuDF libraries for contributors
+
+The general workflow is provided below. Please also see the last section about [code formatting](#code-formatting).
+
+#### `libcudf` (C++)
+
+If you're only interested in building the library (and not the unit tests):
+ 
+```bash
+cd $CUDF_HOME
+./build.sh libcudf
+```
+If, in addition, you want to build tests:
+
+```bash
+./build.sh libcudf tests
+```
+To run the tests:
+
 ```bash
-$ cd $CUDF_HOME/cpp                                                       # navigate to C/C++ CUDA source root directory
-$ mkdir build                                                             # make a build directory
-$ cd build                                                                # enter the build directory
+make test                                      
+```
 
-# CMake options:
-# -DCMAKE_INSTALL_PREFIX set to the install path for your libraries or $CONDA_PREFIX if you're using Anaconda, i.e. -DCMAKE_INSTALL_PREFIX=/install/path or -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
-$ cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX                           # configure cmake ...
-$ make -j                                                                 # compile the libraries librmm.so, libcudf.so ... '-j' will start a parallel job using the number of physical cores available on your system
-$ make install                                                            # install the libraries librmm.so, libcudf.so to the CMAKE_INSTALL_PREFIX
+#### `cudf` (Python)
+
+- First, build the `libcudf` C++ library following the steps above
+
+- To build and install the `cudf` Python package in edit/develop mode:
+```bash
+cd $CUDF_HOME/python/cudf
+python setup.py build_ext --inplace
+python setup.py develop
 ```
 
-- As a convenience, a `build.sh` script is provided in `$CUDF_HOME`. To execute the same build commands above, run the script as shown below.  Note that the libraries will be installed to the location set in `$INSTALL_PREFIX` if set (i.e. `export INSTALL_PREFIX=/install/path`), otherwise to `$CONDA_PREFIX`.
+- To run `cudf` tests :
 ```bash
-$ cd $CUDF_HOME
-$ ./build.sh                                                              # To build both C++ and Python cuDF versions with their dependencies
+cd $CUDF_HOME/python
+py.test -v cudf/cudf/tests
 ```
-- To build only the C++ component with the script
+
+#### `dask-cudf` (Python)
+
+- First, build the `libcudf` C++ and `cudf` Python libraries following the steps above
+
+- To install the `dask-cudf` Python package in edit/develop mode:
 ```bash
-$ ./build.sh libcudf                                                      # Build only the cuDF C++ components and install them to $INSTALL_PREFIX if set, otherwise $CONDA_PREFIX
+cd $CUDF_HOME/python/dask_cudf
+python setup.py build_ext --inplace
+python setup.py develop
 ```
 
-- To run tests (Optional):
+- To run `dask_cudf` tests :
 ```bash
-$ make test
+cd $CUDF_HOME/python
+py.test -v dask_cudf
 ```
-- Build the `cudf` python package, in the `python/cudf` folder:
+
+#### `libcudf_kafka` (C++)
+
+If you're only interested in building the library (and not the unit tests):
+ 
 ```bash
-$ cd $CUDF_HOME/python/cudf
-$ python setup.py build_ext --inplace
-$ python setup.py install
+cd $CUDF_HOME
+./build.sh libcudf_kafka
 ```
+If, in addition, you want to build tests:
 
-- Like the `libcudf` build step above, `build.sh` can also be used to build the `cudf` python package, as shown below:
 ```bash
-$ cd $CUDF_HOME
-$ ./build.sh cudf
+./build.sh libcudf_kafka tests
 ```
+To run the tests:
 
-- Additionally to build the `dask-cudf` python package, in the `python/dask_cudf` folder:
 ```bash
-$ cd $CUDF_HOME/python/dask_cudf
-$ python setup.py install
+make test                                      
 ```
 
-- The `build.sh` script can also  be used to build the `dask-cudf` python package, as shown below:
+#### `cudf-kafka` (Python)
+
+- First, build the `libcudf` and `libcudf_kafka` libraries following the steps above
+
+- To install the `cudf-kafka` Python package in edit/develop mode:
 ```bash
-$ cd $CUDF_HOME
-$ ./build.sh dask_cudf
+cd $CUDF_HOME/python/cudf_kafka
+python setup.py build_ext --inplace
+python setup.py develop
 ```
 
-- To run Python tests (Optional):
+#### `custreamz` (Python)
+
+- First, build `libcudf`, `libcudf_kafka`, and `cudf_kafka` following the steps above
+
+- To install the `custreamz` Python package in edit/develop mode:
 ```bash
-$ cd $CUDF_HOME/python
-$ py.test -v cudf                           # run cudf test suite
-$ py.test -v dask_cudf                      # run dask_cudf test suite
+cd $CUDF_HOME/python/custreamz
+python setup.py build_ext --inplace
+python setup.py develop
 ```
 
-- Other `build.sh` options:
+- To run `custreamz` tests :
 ```bash
-$ cd $CUDF_HOME
-$ ./build.sh clean                     # remove any prior build artifacts and configuration (start over)
-$ ./build.sh libcudf -v                # compile and install libcudf with verbose output
-$ ./build.sh libcudf -g                # compile and install libcudf for debug
-$ PARALLEL_LEVEL=4 ./build.sh libcudf  # compile and install libcudf limiting parallel build jobs to 4 (make -j4)
-$ ./build.sh libcudf -n                # compile libcudf but do not install
+cd $CUDF_HOME/python
+py.test -v custreamz
 ```
 
-Done! You are ready to develop for the cuDF OSS project.
+#### `cudf` (Java):
+
+- First, build the `libcudf` C++ library following the steps above
+
+- Then, refer to [Java README](https://github.com/rapidsai/cudf/blob/branch-21.10/java/README.md)
+
+
+Done! You are ready to develop for the cuDF OSS project. But please go to [code formatting](#code-formatting) to ensure that the code you contribute follows the expected format.
 
 ## Debugging cuDF
 
 ### Building Debug mode from source
 
-Follow the [above instructions](#build-from-source) to build from source and add `-DCMAKE_BUILD_TYPE=Debug` to the `cmake` step.
+Follow the [above instructions](#build-cudf-from-source) to build from source and add `-g` to the `./build.sh` command.
 
 For example:
 ```bash
-$ cmake .. -DCMAKE_INSTALL_PREFIX=/install/path -DCMAKE_BUILD_TYPE=Debug     # configure cmake ... use -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX if you're using Anaconda
+./build.sh libcudf -g
 ```
 
 This builds `libcudf` in Debug mode which enables some `assert` safety checks and includes symbols in the library for debugging.
@@ -289,6 +276,7 @@ You can then use `cuda-dbg` to debug into the kernels in that source file.
 Before submitting a pull request, you can do a local build and test on your machine that mimics our gpuCI environment using the `ci/local/build.sh` script.
 For detailed information on usage of this script, see [here](ci/local/README.md).
 
+
 ## Automated Build in Docker Container
 
 A Dockerfile is provided with a preconfigured conda environment for building and installing cuDF from source based off of the main branch.
@@ -303,11 +291,11 @@ A Dockerfile is provided with a preconfigured conda environment for building and
 
 From cudf project root run the following, to build with defaults:
 ```bash
-$ docker build --tag cudf .
+docker build --tag cudf .
 ```
 After the container is built run the container:
 ```bash
-$ docker run --runtime=nvidia -it cudf bash
+docker run --runtime=nvidia -it cudf bash
 ```
 Activate the conda environment `cudf` to use the newly built cuDF and libcudf libraries:
 ```
@@ -337,6 +325,71 @@ flag. Below is a list of the available arguments and their purpose:
 | `CYTHON_VERSION` | 0.29 | Not supported | set Cython version |
 | `PYTHON_VERSION` | 3.7 | 3.8 | set python version |
 
+
+### Code Formatting
+
+
+#### Python
+
+cuDF uses [Black](https://black.readthedocs.io/en/stable/),
+[isort](https://readthedocs.org/projects/isort/), and
+[flake8](http://flake8.pycqa.org/en/latest/) to ensure a consistent code format
+throughout the project. They have been installed during the `cudf_dev` environment creation.
+
+These tools are used to auto-format the Python code, as well as check the Cython
+code in the repository. Additionally, there is a CI check in place to enforce
+that committed code follows our standards. You can use the tools to
+automatically format your python code by running:
+
+```bash
+isort --atomic python/**/*.py
+black python
+```
+
+and then check the syntax of your Python and Cython code by running:
+
+```bash
+flake8 python
+flake8 --config=python/.flake8.cython
+```
+
+Additionally, many editors have plugins that will apply `isort` and `Black` as
+you edit files, as well as use `flake8` to report any style / syntax issues.
+
+#### C++/CUDA
+
+cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html) to format its C++/CUDA code.
+
+In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run:
+```
+python3 ./cpp/scripts/run-clang-format.py -inplace
+```
+
+Additionally, many editors have plugins or extensions that you can set up to automatically run `clang-format` either manually or on file save.
+
+#### Pre-commit hooks
+
+Optionally, you may wish to set up [pre-commit hooks](https://pre-commit.com/)
+to automatically run `isort`, `Black`, `flake8` and `clang-format` when you make a git commit.
+This can be done by installing `pre-commit` via `conda` or `pip`:
+
+```bash
+conda install -c conda-forge pre_commit
+```
+
+```bash
+pip install pre-commit
+```
+
+and then running:
+
+```bash
+pre-commit install
+```
+
+from the root of the cuDF repository. Now `isort`, `Black`, `flake8` and `clang-format` will be
+run each time you commit changes.
+
 ---
 
 ## Attribution

From 4d8e401f778396bd909f5ea0f75b80ec52ebd74c Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Thu, 26 Aug 2021 19:05:45 -0400
Subject: [PATCH 46/46] Fix a bug: inner_join_size return zero if build table
 is empty (#9128)

Closes https://github.com/rapidsai/cudf/issues/9092

This PR fixed a bug where `inner_join_size` would throw an exception if the build table is empty. Corresponding unit tests are added as well.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/9128
---
 cpp/src/join/hash_join.cu     | 10 +++---
 cpp/tests/join/join_tests.cpp | 66 +++++++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index ee1eaeaed47..636729a735e 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -347,7 +347,9 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p
                                                        rmm::cuda_stream_view stream) const
 {
   CUDF_FUNC_RANGE();
-  CUDF_EXPECTS(_hash_table, "Hash table of hash join is null.");
+
+  // Return directly if build table is empty
+  if (_hash_table == nullptr) { return 0; }
 
   auto flattened_probe = structs::detail::flatten_nested_columns(
     probe, {}, {}, structs::detail::column_nullability::FORCE);
@@ -367,7 +369,7 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr
   CUDF_FUNC_RANGE();
 
   // Trivial left join case - exit early
-  if (!_hash_table) { return probe.num_rows(); }
+  if (_hash_table == nullptr) { return probe.num_rows(); }
 
   auto flattened_probe = structs::detail::flatten_nested_columns(
     probe, {}, {}, structs::detail::column_nullability::FORCE);
@@ -388,7 +390,7 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr
   CUDF_FUNC_RANGE();
 
   // Trivial left join case - exit early
-  if (!_hash_table) { return probe.num_rows(); }
+  if (_hash_table == nullptr) { return probe.num_rows(); }
 
   auto flattened_probe = structs::detail::flatten_nested_columns(
     probe, {}, {}, structs::detail::column_nullability::FORCE);
@@ -447,7 +449,7 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe,
                                               rmm::mr::device_memory_resource* mr) const
 {
   // Trivial left join case - exit early
-  if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) {
+  if (_hash_table == nullptr and JoinKind != cudf::detail::join_kind::INNER_JOIN) {
     return get_trivial_left_join_indices(probe, stream, mr);
   }
 
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index af998e366e9..8945f82baef 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -987,8 +987,26 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
+  {
+    auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
+  }
+
+  {
+    cudf::hash_join hash_join(empty1, cudf::null_equality::EQUAL);
+
+    auto output_size                         = hash_join.inner_join_size(t0);
+    std::optional<std::size_t> optional_size = output_size;
+
+    std::size_t const size_gold = 0;
+    EXPECT_EQ(output_size, size_gold);
+
+    auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size);
+    column_wrapper<int32_t> col_gold_0{};
+    column_wrapper<int32_t> col_gold_1{};
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+  }
 }
 
 TEST_F(JoinTest, EmptyRightTableLeftJoin)
@@ -1008,8 +1026,26 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
+  {
+    auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
+  }
+
+  {
+    cudf::hash_join hash_join(empty1, cudf::null_equality::EQUAL);
+
+    auto output_size                         = hash_join.left_join_size(t0);
+    std::optional<std::size_t> optional_size = output_size;
+
+    std::size_t const size_gold = 5;
+    EXPECT_EQ(output_size, size_gold);
+
+    auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size);
+    column_wrapper<int32_t> col_gold_0{{0, 1, 2, 3, 4}};
+    column_wrapper<int32_t> col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}};
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+  }
 }
 
 TEST_F(JoinTest, EmptyRightTableFullJoin)
@@ -1029,8 +1065,26 @@ TEST_F(JoinTest, EmptyRightTableFullJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
+  {
+    auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
+  }
+
+  {
+    cudf::hash_join hash_join(empty1, cudf::null_equality::EQUAL);
+
+    auto output_size                         = hash_join.full_join_size(t0);
+    std::optional<std::size_t> optional_size = output_size;
+
+    std::size_t const size_gold = 5;
+    EXPECT_EQ(output_size, size_gold);
+
+    auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size);
+    column_wrapper<int32_t> col_gold_0{{0, 1, 2, 3, 4}};
+    column_wrapper<int32_t> col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}};
+    auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+  }
 }
 
 // Both tables empty