From 8d9d22231b983f0a85ce594dc9758ab6a6c09559 Mon Sep 17 00:00:00 2001
From: Paul Taylor <paul.e.taylor@me.com>
Date: Mon, 29 Nov 2021 09:26:03 -0600
Subject: [PATCH 01/42] [FIX] Add `arrow_dataset` and `parquet` targets to
 build exports (#9491)

This PR adds the `arrow_dataset` and `parquet` targets to the build export when Arrow is built from source by CPM, similar to what we have to do today for the `arrow` and `arrow_cuda` targets.
---
 cpp/CMakeLists.txt                   | 21 ++++++
 cpp/cmake/thirdparty/get_arrow.cmake | 96 +++++++++++++++++++++++-----
 2 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 966728d7647..59dc3c74af2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -733,6 +733,27 @@ set(install_code_string
     [=[
 set(ArrowCUDA_DIR "${Arrow_DIR}")
 find_dependency(ArrowCUDA)
+]=]
+)
+
+if(CUDF_ENABLE_ARROW_PARQUET)
+  string(
+    APPEND
+    install_code_string
+    [=[
+  if(NOT Parquet_DIR)
+    set(Parquet_DIR "${Arrow_DIR}")
+  endif()
+  set(ArrowDataset_DIR "${Arrow_DIR}")
+  find_dependency(ArrowDataset)
+  ]=]
+  )
+endif()
+
+string(
+  APPEND
+  install_code_string
+  [=[
 if(testing IN_LIST cudf_FIND_COMPONENTS)
   enable_language(CUDA)
   if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake")
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 5fe37402096..ae1448da502 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -90,7 +90,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
 
   rapids_cpm_find(
     Arrow ${VERSION}
-    GLOBAL_TARGETS arrow_shared arrow_cuda_shared
+    GLOBAL_TARGETS arrow_shared parquet_shared arrow_cuda_shared arrow_dataset_shared
     CPM_ARGS
     GIT_REPOSITORY https://github.com/apache/arrow.git
     GIT_TAG apache-arrow-${VERSION}
@@ -142,6 +142,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
       set(ArrowCUDA_DIR "${Arrow_DIR}")
       find_package(Arrow REQUIRED QUIET)
       find_package(ArrowCUDA REQUIRED QUIET)
+      if(ENABLE_PARQUET)
+        if(NOT Parquet_DIR)
+          # Set this to enable `find_package(Parquet)`
+          set(Parquet_DIR "${Arrow_DIR}")
+        endif()
+        # Set this to enable `find_package(ArrowDataset)`
+        set(ArrowDataset_DIR "${Arrow_DIR}")
+        find_package(ArrowDataset REQUIRED QUIET)
+      endif()
     elseif(Arrow_ADDED)
       # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to
       # target_include_directories. That defeats ccache.
@@ -182,24 +191,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
   endif()
 
   if(Arrow_ADDED)
+
     set(arrow_code_string
         [=[
-        if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared))
-            add_library(arrow_shared ALIAS cudf::arrow_shared)
-        endif()
-        if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static))
-            add_library(arrow_static ALIAS cudf::arrow_static)
-        endif()
-        ]=]
-    )
-    set(arrow_cuda_code_string
-        [=[
-        if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared))
-            add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared)
-        endif()
-        if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static))
-            add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static)
-        endif()
+          if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared))
+              add_library(arrow_shared ALIAS cudf::arrow_shared)
+          endif()
+          if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static))
+              add_library(arrow_static ALIAS cudf::arrow_static)
+          endif()
         ]=]
     )
 
@@ -212,6 +212,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
       FINAL_CODE_BLOCK arrow_code_string
     )
 
+    set(arrow_cuda_code_string
+        [=[
+          if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared))
+              add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared)
+          endif()
+          if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static))
+              add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static)
+          endif()
+        ]=]
+    )
+
     rapids_export(
       BUILD ArrowCUDA
       VERSION ${VERSION}
@@ -220,6 +231,49 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
       NAMESPACE cudf::
       FINAL_CODE_BLOCK arrow_cuda_code_string
     )
+
+    if(ENABLE_PARQUET)
+
+      set(arrow_dataset_code_string
+          [=[
+              if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared))
+                  add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared)
+              endif()
+              if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static))
+                  add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static)
+              endif()
+            ]=]
+      )
+
+      rapids_export(
+        BUILD ArrowDataset
+        VERSION ${VERSION}
+        EXPORT_SET arrow_dataset_targets
+        GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static
+        NAMESPACE cudf::
+        FINAL_CODE_BLOCK arrow_dataset_code_string
+      )
+
+      set(parquet_code_string
+          [=[
+              if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared))
+                  add_library(parquet_shared ALIAS cudf::parquet_shared)
+              endif()
+              if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static))
+                  add_library(parquet_static ALIAS cudf::parquet_static)
+              endif()
+            ]=]
+      )
+
+      rapids_export(
+        BUILD Parquet
+        VERSION ${VERSION}
+        EXPORT_SET parquet_targets
+        GLOBAL_TARGETS parquet_shared parquet_static
+        NAMESPACE cudf::
+        FINAL_CODE_BLOCK parquet_code_string
+      )
+    endif()
   endif()
   # We generate the arrow-config and arrowcuda-config files when we built arrow locally, so always
   # do `find_dependency`
@@ -230,10 +284,18 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
   # ArrowCUDA_DIR to be where Arrow was found, since Arrow packages ArrowCUDA.config in a
   # non-standard location
   rapids_export_package(BUILD ArrowCUDA cudf-exports)
+  if(ENABLE_PARQUET)
+    rapids_export_package(BUILD Parquet cudf-exports)
+    rapids_export_package(BUILD ArrowDataset cudf-exports)
+  endif()
 
   include("${rapids-cmake-dir}/export/find_package_root.cmake")
   rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
   rapids_export_find_package_root(BUILD ArrowCUDA [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
+  if(ENABLE_PARQUET)
+    rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
+    rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
+  endif()
 
   set(ARROW_FOUND
       "${ARROW_FOUND}"

From a1ca8c1e408ac1791c4f4bae563e775bbddb5a29 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Mon, 29 Nov 2021 11:32:43 -0500
Subject: [PATCH 02/42] Use ptxcompiler to patch Numba at runtime to support
 CUDA enhanced compatibility. (#9687)

---
 conda/environments/cudf_dev_cuda11.0.yml      |  3 +-
 conda/environments/cudf_dev_cuda11.2.yml      |  3 +-
 conda/environments/cudf_dev_cuda11.5.yml      |  3 +-
 conda/recipes/cudf/meta.yaml                  |  5 +-
 python/cudf/cudf/__init__.py                  | 11 ++++
 .../cudf/tests/test_extension_compilation.py  | 57 ++++++++++++-------
 6 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index e2ead779861..7c22b4d35e3 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -14,7 +14,7 @@ dependencies:
   - cmake>=3.20.1
   - cmake_setuptools>=0.1.3
   - python>=3.7,<3.9
-  - numba>=0.53.1
+  - numba>=0.54
   - numpy
   - pandas>=1.0,<1.4.0dev0
   - pyarrow=5.0.0=*cuda
@@ -66,3 +66,4 @@ dependencies:
       - git+https://github.com/dask/distributed.git@main
       - git+https://github.com/python-streamz/streamz.git@master
       - pyorc
+  - ptxcompiler  # [linux64]
diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml
index 6146d84835a..0978ae7c8f9 100644
--- a/conda/environments/cudf_dev_cuda11.2.yml
+++ b/conda/environments/cudf_dev_cuda11.2.yml
@@ -14,7 +14,7 @@ dependencies:
   - cmake>=3.20.1
   - cmake_setuptools>=0.1.3
   - python>=3.7,<3.9
-  - numba>=0.53.1
+  - numba>=0.54
   - numpy
   - pandas>=1.0,<1.4.0dev0
   - pyarrow=5.0.0=*cuda
@@ -66,3 +66,4 @@ dependencies:
       - git+https://github.com/dask/distributed.git@main
       - git+https://github.com/python-streamz/streamz.git@master
       - pyorc
+  - ptxcompiler  # [linux64]
diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml
index 043c81c9e01..d2d0a38c44e 100644
--- a/conda/environments/cudf_dev_cuda11.5.yml
+++ b/conda/environments/cudf_dev_cuda11.5.yml
@@ -14,7 +14,7 @@ dependencies:
   - cmake>=3.20.1
   - cmake_setuptools>=0.1.3
   - python>=3.7,<3.9
-  - numba>=0.53.1
+  - numba>=0.54
   - numpy
   - pandas>=1.0,<1.4.0dev0
   - pyarrow=5.0.0=*cuda
@@ -66,3 +66,4 @@ dependencies:
       - git+https://github.com/dask/distributed.git@main
       - git+https://github.com/python-streamz/streamz.git@master
       - pyorc
+  - ptxcompiler  # [linux64]
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 6d56b0c0c94..46eefbc825f 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -29,7 +29,7 @@ requirements:
     - python
     - cython >=0.29,<0.30
     - setuptools
-    - numba >=0.53.1
+    - numba >=0.54
     - dlpack>=0.5,<0.6.0a0
     - pyarrow 5.0.0 *cuda
     - libcudf {{ version }}
@@ -41,7 +41,7 @@ requirements:
     - typing_extensions
     - pandas >=1.0,<1.4.0dev0
     - cupy >=9.5.0,<10.0.0a0
-    - numba >=0.53.1
+    - numba >=0.54
     - numpy
     - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda
     - fastavro >=0.22.0
@@ -51,6 +51,7 @@ requirements:
     - nvtx >=0.2.1
     - packaging
     - cachetools
+    - ptxcompiler  # [linux64]  # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler
 
 test:                                   # [linux64]
   requires:                             # [linux64]
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index bc35551b5bd..b24e71e7785 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -102,6 +102,17 @@
 from cudf.utils.dtypes import _NA_REP
 from cudf.utils.utils import set_allocator
 
+try:
+    from ptxcompiler.patch import patch_numba_codegen_if_needed
+except ImportError:
+    pass
+else:
+    # Patch Numba to support CUDA enhanced compatibility.
+    # See https://github.com/rapidsai/ptxcompiler for
+    # details.
+    patch_numba_codegen_if_needed()
+    del patch_numba_codegen_if_needed
+
 cuda.set_memory_manager(rmm.RMMNumbaManager)
 cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
 
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 39fa7b11ce2..47c9448cf63 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,5 +1,6 @@
 import operator
 
+import cupy as cp
 import pytest
 from numba import cuda, types
 from numba.cuda import compile_ptx
@@ -71,8 +72,8 @@ def test_execute_masked_binary(op, ty):
     def func(x, y):
         return op(x, y)
 
-    @cuda.jit(debug=True)
-    def test_kernel(x, y):
+    @cuda.jit
+    def test_kernel(x, y, err):
         # Reference result with unmasked value
         u = func(x, y)
 
@@ -87,14 +88,22 @@ def test_kernel(x, y):
         # Check masks are as expected, and unmasked result matches masked
         # result
         if r0.valid:
-            raise RuntimeError("Expected r0 to be invalid")
+            # TODO: ideally, we would raise an exception here rather
+            # than return an "error code", and that is what the
+            # previous version of this (and below) tests did. But,
+            # Numba kernels cannot currently use `debug=True` with
+            # CUDA enhanced compatibility.  Once a solution to that is
+            # reached, we should switch back to raising exceptions
+            # here.
+            err[0] = 1
         if not r1.valid:
-            raise RuntimeError("Expected r1 to be valid")
+            err[0] = 2
         if u != r1.value:
-            print("Values: ", u, r1.value)
-            raise RuntimeError("u != r1.value")
+            err[0] = 3
 
-    test_kernel[1, 1](1, 2)
+    err = cp.asarray([0], dtype="int8")
+    test_kernel[1, 1](1, 2, err)
+    assert err[0] == 0
 
 
 @pytest.mark.parametrize("op", ops)
@@ -187,18 +196,20 @@ def test_is_na(fn):
 
     device_fn = cuda.jit(device=True)(fn)
 
-    @cuda.jit(debug=True)
-    def test_kernel():
+    @cuda.jit
+    def test_kernel(err):
         valid_is_na = device_fn(valid)
         invalid_is_na = device_fn(invalid)
 
         if valid_is_na:
-            raise RuntimeError("Valid masked value is NA and should not be")
+            err[0] = 1
 
         if not invalid_is_na:
-            raise RuntimeError("Invalid masked value is not NA and should be")
+            err[0] = 2
 
-    test_kernel[1, 1]()
+    err = cp.asarray([0], dtype="int8")
+    test_kernel[1, 1](err)
+    assert err[0] == 0
 
 
 def func_lt_na(x):
@@ -271,8 +282,8 @@ def test_na_masked_comparisons(fn, ty):
 
     device_fn = cuda.jit(device=True)(fn)
 
-    @cuda.jit(debug=True)
-    def test_kernel():
+    @cuda.jit
+    def test_kernel(err):
         unmasked = ty(1)
         valid_masked = Masked(unmasked, True)
         invalid_masked = Masked(unmasked, False)
@@ -281,12 +292,14 @@ def test_kernel():
         invalid_cmp_na = device_fn(invalid_masked)
 
         if valid_cmp_na:
-            raise RuntimeError("Valid masked value compared True with NA")
+            err[0] = 1
 
         if invalid_cmp_na:
-            raise RuntimeError("Invalid masked value compared True with NA")
+            err[0] = 2
 
-    test_kernel[1, 1]()
+    err = cp.asarray([0], dtype="int8")
+    test_kernel[1, 1](err)
+    assert err[0] == 0
 
 
 # xfail because scalars do not yet cast for a comparison to NA
@@ -297,13 +310,15 @@ def test_na_scalar_comparisons(fn, ty):
 
     device_fn = cuda.jit(device=True)(fn)
 
-    @cuda.jit(debug=True)
-    def test_kernel():
+    @cuda.jit
+    def test_kernel(err):
         unmasked = ty(1)
 
         unmasked_cmp_na = device_fn(unmasked)
 
         if unmasked_cmp_na:
-            raise RuntimeError("Unmasked value compared True with NA")
+            err[0] = 1
 
-    test_kernel[1, 1]()
+    err = cp.asarray([0], dtype="int8")
+    test_kernel[1, 1](err)
+    assert err[0] == 0

From 0ebeffa4c8122cd1f54fe9fc05c4bec660b7e37e Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 23 Nov 2021 16:14:17 -0500
Subject: [PATCH 03/42] Only run runtime jit tests with CUDA 11.5 runtime

CI runs all tests on a variety of hardware. Tests that have to
use NVRTC to re-compile cudf C++ code are only supported on
CUDA 11.5+, so skip them when the detected CUDA runtime is older.
---
 cpp/tests/binaryop/binop-generic-ptx-test.cpp |  6 ++++
 cpp/tests/binaryop/binop-integration-test.cpp |  6 ++++
 cpp/tests/binaryop/binop-null-test.cpp        |  8 +++++
 cpp/tests/binaryop/util/runtime_support.h     | 33 +++++++++++++++++++
 4 files changed, 53 insertions(+)
 create mode 100644 cpp/tests/binaryop/util/runtime_support.h

diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
index 6e35bdac41c..0b6cfdab498 100644
--- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp
+++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
@@ -21,11 +21,17 @@
 
 #include <tests/binaryop/assert-binops.h>
 #include <tests/binaryop/binop-fixture.hpp>
+#include <tests/binaryop/util/runtime_support.h>
 
 namespace cudf {
 namespace test {
 namespace binop {
 struct BinaryOperationGenericPTXTest : public BinaryOperationTest {
+ protected:
+  void SetUp() override
+  {
+    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
+  }
 };
 
 TEST_F(BinaryOperationGenericPTXTest, CAdd_Vector_Vector_FP32_FP32_FP32)
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 427a21512a3..21696a419ee 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -31,6 +31,7 @@
 
 #include <tests/binaryop/assert-binops.h>
 #include <tests/binaryop/binop-fixture.hpp>
+#include <tests/binaryop/util/runtime_support.h>
 #include "cudf/utilities/error.hpp"
 
 namespace cudf {
@@ -40,6 +41,11 @@ namespace binop {
 constexpr debug_output_level verbosity{debug_output_level::ALL_ERRORS};
 
 struct BinaryOperationIntegrationTest : public BinaryOperationTest {
+ protected:
+  void SetUp() override
+  {
+    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
+  }
 };
 
 TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_SI32_FP32_SI64)
diff --git a/cpp/tests/binaryop/binop-null-test.cpp b/cpp/tests/binaryop/binop-null-test.cpp
index 25ec3b30834..b7e7702bd6c 100644
--- a/cpp/tests/binaryop/binop-null-test.cpp
+++ b/cpp/tests/binaryop/binop-null-test.cpp
@@ -23,6 +23,8 @@
 #include <tests/binaryop/assert-binops.h>
 #include <tests/binaryop/binop-fixture.hpp>
 
+#include <tests/binaryop/util/runtime_support.h>
+
 namespace cudf {
 namespace test {
 namespace binop {
@@ -52,6 +54,12 @@ struct BinaryOperationNullTest : public BinaryOperationTest {
       default: CUDF_FAIL("Unknown mask state " + std::to_string(static_cast<int64_t>(state)));
     }
   }
+
+ protected:
+  void SetUp() override
+  {
+    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
+  }
 };  // namespace binop
 
 TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_Valid)
diff --git a/cpp/tests/binaryop/util/runtime_support.h b/cpp/tests/binaryop/util/runtime_support.h
new file mode 100644
index 00000000000..b6cfdadee3c
--- /dev/null
+++ b/cpp/tests/binaryop/util/runtime_support.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Copyright 2018-2019 BlazingDB, Inc.
+ *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+
+bool can_do_runtime_jit()
+{
+  // We require a CUDA NVRTC of 11.5+ to do runtime jit
+  // as we need support for __int128
+
+  int runtime = 0;
+  auto error_value = cudaRuntimeGetVersion(&runtime);
+  return (error_value == cudaSuccess) && (runtime >= 11050);
+}

From dfcb48d09a56daa226ec5962acb00ddcaadaf494 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 23 Nov 2021 16:22:08 -0500
Subject: [PATCH 04/42] Fix style issues found by CI

---
 cpp/tests/binaryop/binop-generic-ptx-test.cpp | 2 +-
 cpp/tests/binaryop/binop-integration-test.cpp | 2 +-
 cpp/tests/binaryop/util/runtime_support.h     | 5 ++---
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
index 0b6cfdab498..f4407834786 100644
--- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp
+++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
@@ -20,8 +20,8 @@
 #include <cudf/binaryop.hpp>
 
 #include <tests/binaryop/assert-binops.h>
-#include <tests/binaryop/binop-fixture.hpp>
 #include <tests/binaryop/util/runtime_support.h>
+#include <tests/binaryop/binop-fixture.hpp>
 
 namespace cudf {
 namespace test {
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 21696a419ee..4181b20220b 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -30,8 +30,8 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <tests/binaryop/assert-binops.h>
-#include <tests/binaryop/binop-fixture.hpp>
 #include <tests/binaryop/util/runtime_support.h>
+#include <tests/binaryop/binop-fixture.hpp>
 #include "cudf/utilities/error.hpp"
 
 namespace cudf {
diff --git a/cpp/tests/binaryop/util/runtime_support.h b/cpp/tests/binaryop/util/runtime_support.h
index b6cfdadee3c..a7ee0c3a391 100644
--- a/cpp/tests/binaryop/util/runtime_support.h
+++ b/cpp/tests/binaryop/util/runtime_support.h
@@ -21,13 +21,12 @@
 
 #include <cuda_runtime.h>
 
-
-bool can_do_runtime_jit()
+inline bool can_do_runtime_jit()
 {
   // We require a CUDA NVRTC of 11.5+ to do runtime jit
   // as we need support for __int128
 
-  int runtime = 0;
+  int runtime      = 0;
   auto error_value = cudaRuntimeGetVersion(&runtime);
   return (error_value == cudaSuccess) && (runtime >= 11050);
 }

From bbf137eb16ba69e14de9924acd9ca47997da0324 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 24 Nov 2021 09:58:53 -0500
Subject: [PATCH 05/42] WIP: disable csv test

---
 cpp/tests/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 98bade7e15f..39ca11d8184 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -192,7 +192,7 @@ ConfigureTest(
 # * io tests --------------------------------------------------------------------------------------
 ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp)
 
-ConfigureTest(CSV_TEST io/csv_test.cpp)
+# ConfigureTest(CSV_TEST io/csv_test.cpp)
 ConfigureTest(ORC_TEST io/orc_test.cpp)
 ConfigureTest(PARQUET_TEST io/parquet_test.cpp)
 ConfigureTest(JSON_TEST io/json_test.cpp)

From a24d2a841e9ab0b94f39418790b4f4b1d88234ff Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 24 Nov 2021 11:15:22 -0500
Subject: [PATCH 06/42] WIP: disable all io tests

---
 cpp/tests/CMakeLists.txt | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 39ca11d8184..f60a2361752 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -192,15 +192,11 @@ ConfigureTest(
 # * io tests --------------------------------------------------------------------------------------
 ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp)
 
-# ConfigureTest(CSV_TEST io/csv_test.cpp)
-ConfigureTest(ORC_TEST io/orc_test.cpp)
-ConfigureTest(PARQUET_TEST io/parquet_test.cpp)
-ConfigureTest(JSON_TEST io/json_test.cpp)
-ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
-ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
-if(CUDF_ENABLE_ARROW_S3)
-  target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED")
-endif()
+# ConfigureTest(CSV_TEST io/csv_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp)
+# ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp)
+# ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST
+# io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3)
+# target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif()
 
 # ##################################################################################################
 # * sort tests ------------------------------------------------------------------------------------

From f6143952bf27ccd3c29c7b015099d9db0fdf0528 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 25 Nov 2021 23:06:33 +0530
Subject: [PATCH 07/42] remove jit integration tests

---
 cpp/tests/CMakeLists.txt                      |    1 -
 .../binop-compiled-fixed_point-test.cpp       |   40 +
 cpp/tests/binaryop/binop-compiled-test.cpp    |    2 +
 cpp/tests/binaryop/binop-integration-test.cpp | 2722 -----------------
 cpp/tests/binaryop/binop-null-test.cpp        |   32 +-
 .../binaryop/binop-verify-input-test.cpp      |   12 +-
 6 files changed, 64 insertions(+), 2745 deletions(-)
 delete mode 100644 cpp/tests/binaryop/binop-integration-test.cpp

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f60a2361752..8ae31d7d74d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -164,7 +164,6 @@ ConfigureTest(
   BINARY_TEST
   binaryop/binop-verify-input-test.cpp
   binaryop/binop-null-test.cpp
-  binaryop/binop-integration-test.cpp
   binaryop/binop-compiled-test.cpp
   binaryop/binop-compiled-fixed_point-test.cpp
   binaryop/binop-generic-ptx-test.cpp
diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
index 7925f0dd618..5020fbf898b 100644
--- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
@@ -684,4 +684,44 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpThrows)
                cudf::logic_error);
 }
 
+template <typename T>
+struct FixedPointTest_64_128_Reps : public cudf::test::BaseFixture {
+};
+
+using Decimal64And128Types = cudf::test::Types<numeric::decimal64, numeric::decimal128>;
+TYPED_TEST_SUITE(FixedPointTest_64_128_Reps, Decimal64And128Types);
+
+TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  for (auto const rhs_value : {10000000000000000, 100000000000000000}) {
+    auto const lhs       = fp_wrapper<RepType>{{33041, 97290, 36438, 25379, 48473}, scale_type{2}};
+    auto const rhs       = make_fixed_point_scalar<decimalXX>(rhs_value, scale_type{0});
+    auto const trues     = wrapper<bool>{{1, 1, 1, 1, 1}};
+    auto const falses    = wrapper<bool>{{0, 0, 0, 0, 0}};
+    auto const bool_type = cudf::data_type{type_id::BOOL8};
+
+    auto const a = cudf::binary_operation(lhs, *rhs, binary_operator::LESS, bool_type);
+    auto const b = cudf::binary_operation(lhs, *rhs, binary_operator::LESS_EQUAL, bool_type);
+    auto const c = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER, bool_type);
+    auto const d = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const e = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER, bool_type);
+    auto const f = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER_EQUAL, bool_type);
+    auto const g = cudf::binary_operation(*rhs, lhs, binary_operator::LESS, bool_type);
+    auto const h = cudf::binary_operation(*rhs, lhs, binary_operator::LESS_EQUAL, bool_type);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(a->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(b->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(c->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(d->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(e->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(f->view(), trues);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(g->view(), falses);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses);
+  }
+}
+
 }  // namespace cudf::test::binop
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 7a9f6135bcd..37212c30d80 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -679,3 +679,5 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector)
 }
 
 }  // namespace cudf::test::binop
+
+CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
deleted file mode 100644
index 4181b20220b..00000000000
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ /dev/null
@@ -1,2722 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/binaryop.hpp>
-#include <cudf/detail/iterator.cuh>
-#include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/types.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
-
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/type_lists.hpp>
-
-#include <tests/binaryop/assert-binops.h>
-#include <tests/binaryop/util/runtime_support.h>
-#include <tests/binaryop/binop-fixture.hpp>
-#include "cudf/utilities/error.hpp"
-
-namespace cudf {
-namespace test {
-namespace binop {
-
-constexpr debug_output_level verbosity{debug_output_level::ALL_ERRORS};
-
-struct BinaryOperationIntegrationTest : public BinaryOperationTest {
- protected:
-  void SetUp() override
-  {
-    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
-  }
-};
-
-TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_SI32_FP32_SI64)
-{
-  using TypeOut = int32_t;
-  using TypeLhs = float;
-  using TypeRhs = int64_t;
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  auto rhs = make_random_wrapped_column<TypeRhs>(10000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP32_FP32)
-{
-  using TypeOut = int32_t;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(10000);
-  auto rhs = make_random_wrapped_column<TypeRhs>(10000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_SI32_FP32_FP32)
-{
-  using TypeOut = int32_t;
-  using TypeLhs = float;
-  using TypeRhs = int64_t;
-
-  using SUB = cudf::library::operation::Sub<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  auto rhs = make_random_wrapped_column<TypeRhs>(10000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_SI08_SI16_SI32)
-{
-  using TypeOut = int8_t;
-  using TypeLhs = int16_t;
-  using TypeRhs = int32_t;
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP64_SI08)
-{
-  using TypeOut = int32_t;
-  using TypeLhs = double;
-  using TypeRhs = int8_t;
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_SI64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int64_t;
-
-  using SUB = cudf::library::operation::Sub<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_SI64_FP64_SI32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = double;
-  using TypeRhs = int32_t;
-
-  using SUB = cudf::library::operation::Sub<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(10000);
-  auto rhs = make_random_wrapped_scalar<TypeRhs>();
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_TimepointD_DurationS_TimepointUS)
-{
-  using TypeOut = cudf::timestamp_us;
-  using TypeLhs = cudf::timestamp_D;
-  using TypeRhs = cudf::duration_s;
-
-  using SUB = cudf::library::operation::Sub<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_TimepointD_TimepointS_DurationS)
-{
-  using TypeOut = cudf::duration_s;
-  using TypeLhs = cudf::timestamp_D;
-  using TypeRhs = cudf::timestamp_s;
-
-  using SUB = cudf::library::operation::Sub<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = cudf::scalar_type_t<TypeRhs>(typename TypeRhs::duration{34}, true);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_DurationS_DurationD_DurationMS)
-{
-  using TypeOut = cudf::duration_ms;
-  using TypeLhs = cudf::duration_s;
-  using TypeRhs = cudf::duration_D;
-
-  using SUB = cudf::library::operation::Sub<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::scalar_type_t<TypeLhs>(TypeLhs{-9});
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SUB());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int64_t;
-
-  using MUL = cudf::library::operation::Mul<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64_FP32_FP32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using MUL = cudf::library::operation::Mul<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mul_Scalar_Vector_SI32_DurationD_DurationMS)
-{
-  // Double the duration of days and convert the time interval to ms
-  using TypeOut = cudf::duration_ms;
-  using TypeLhs = int32_t;
-  using TypeRhs = cudf::duration_D;
-
-  using MUL = cudf::library::operation::Mul<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::scalar_type_t<TypeLhs>(2);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_DurationS_SI32_DurationNS)
-{
-  // Multiple each duration with some random value and promote the result
-  using TypeOut = cudf::duration_ns;
-  using TypeLhs = cudf::duration_s;
-  using TypeRhs = int32_t;
-
-  using MUL = cudf::library::operation::Mul<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MUL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int64_t;
-
-  using DIV = cudf::library::operation::Div<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64_FP32_FP32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using DIV = cudf::library::operation::Div<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Div_Scalar_Vector_DurationD_SI32_DurationS)
-{
-  using TypeOut = cudf::duration_s;
-  using TypeLhs = cudf::duration_D;
-  using TypeRhs = int64_t;
-
-  using DIV = cudf::library::operation::Div<TypeOut, TypeLhs, TypeRhs>;
-
-  // Divide 2 days by an integer and convert the ticks to seconds
-  auto lhs = cudf::scalar_type_t<TypeLhs>(TypeLhs{2});
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_DurationD_DurationS_DurationMS)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = cudf::duration_D;
-  using TypeRhs = cudf::duration_s;
-
-  using DIV = cudf::library::operation::Div<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, DIV());
-}
-
-TEST_F(BinaryOperationIntegrationTest, TrueDiv_Vector_Vector_SI64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int64_t;
-
-  using TRUEDIV = cudf::library::operation::TrueDiv<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::TRUE_DIV, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, TRUEDIV());
-}
-
-TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int64_t;
-
-  using FLOORDIV = cudf::library::operation::FloorDiv<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, FLOORDIV());
-}
-
-TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64_FP32_FP32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using FLOORDIV = cudf::library::operation::FloorDiv<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, FLOORDIV());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int64_t;
-
-  using MOD = cudf::library::operation::Mod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP32)
-{
-  using TypeOut = float;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using MOD = cudf::library::operation::Mod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64_FP32_FP32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using MOD = cudf::library::operation::Mod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP64)
-{
-  using TypeOut = double;
-  using TypeLhs = double;
-  using TypeRhs = double;
-
-  using MOD = cudf::library::operation::Mod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationD_SI32_DurationUS)
-{
-  using TypeOut = cudf::duration_us;
-  using TypeLhs = cudf::duration_D;
-  using TypeRhs = int64_t;
-
-  using MOD = cudf::library::operation::Mod<TypeOut, TypeLhs, TypeRhs>;
-
-  // Half the number of days and convert the remainder ticks to microseconds
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = cudf::scalar_type_t<TypeRhs>(2);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationS_DurationMS_DurationUS)
-{
-  using TypeOut = cudf::duration_us;
-  using TypeLhs = cudf::duration_s;
-  using TypeRhs = cudf::duration_ms;
-
-  using MOD = cudf::library::operation::Mod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, MOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP64_SI64_SI64)
-{
-  using TypeOut = double;
-  using TypeLhs = int64_t;
-  using TypeRhs = int64_t;
-
-  using POW = cudf::library::operation::Pow<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
-
-  /**
-   * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision
-   * Mathematical Standard Library Functions with Maximum ULP Error'
-   * The pow function has 2 (full range) maximum ulp error.
-   */
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, POW(), NearEqualComparator<TypeOut>{2});
-}
-
-TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP32)
-{
-  using TypeOut = float;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using POW = cudf::library::operation::Pow<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id<TypeOut>()));
-  /**
-   * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision
-   * Mathematical Standard Library Functions with Maximum ULP Error'
-   * The pow function has 2 (full range) maximum ulp error.
-   */
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, POW(), NearEqualComparator<TypeOut>{2});
-}
-
-TEST_F(BinaryOperationIntegrationTest, And_Vector_Vector_SI16_SI64_SI32)
-{
-  using TypeOut = int16_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int32_t;
-
-  using AND = cudf::library::operation::BitwiseAnd<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::BITWISE_AND, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, AND());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Or_Vector_Vector_SI64_SI16_SI32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int16_t;
-  using TypeRhs = int32_t;
-
-  using OR = cudf::library::operation::BitwiseOr<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::BITWISE_OR, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, OR());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Xor_Vector_Vector_SI32_SI16_SI64)
-{
-  using TypeOut = int32_t;
-  using TypeLhs = int16_t;
-  using TypeRhs = int64_t;
-
-  using XOR = cudf::library::operation::BitwiseXor<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::BITWISE_XOR, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, XOR());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Logical_And_Vector_Vector_SI16_FP64_SI8)
-{
-  using TypeOut = int16_t;
-  using TypeLhs = double;
-  using TypeRhs = int8_t;
-
-  using AND = cudf::library::operation::LogicalAnd<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LOGICAL_AND, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, AND());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Logical_Or_Vector_Vector_B8_SI16_SI64)
-{
-  using TypeOut = bool;
-  using TypeLhs = int16_t;
-  using TypeRhs = int64_t;
-
-  using OR = cudf::library::operation::LogicalOr<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LOGICAL_OR, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, OR());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_TSS_TSS)
-{
-  using TypeOut = bool;
-  using TypeLhs = cudf::timestamp_s;
-  using TypeRhs = cudf::timestamp_s;
-
-  using LESS = cudf::library::operation::Less<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  auto rhs = make_random_wrapped_column<TypeRhs>(10);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Greater_Scalar_Vector_B8_TSMS_TSS)
-{
-  using TypeOut = bool;
-  using TypeLhs = cudf::timestamp_ms;
-  using TypeRhs = cudf::timestamp_s;
-
-  using GREATER = cudf::library::operation::Greater<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_TSS_TSS)
-{
-  using TypeOut = bool;
-  using TypeLhs = cudf::timestamp_s;
-  using TypeRhs = cudf::timestamp_s;
-
-  using LESS = cudf::library::operation::Less<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(10);
-  auto rhs = make_random_wrapped_column<TypeRhs>(10);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_TSMS_TSS)
-{
-  using TypeOut = bool;
-  using TypeLhs = cudf::timestamp_ms;
-  using TypeRhs = cudf::timestamp_s;
-
-  using GREATER = cudf::library::operation::Greater<TypeOut, TypeLhs, TypeRhs>;
-
-  cudf::test::UniformRandomGenerator<long> rand_gen(1, 10);
-  auto itr = cudf::detail::make_counting_transform_iterator(
-    0, [&rand_gen](auto row) { return rand_gen.generate() * 1000; });
-
-  cudf::test::fixed_width_column_wrapper<TypeLhs, typename decltype(itr)::value_type> lhs(
-    itr, itr + 100, make_validity_iter());
-
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using LESS = cudf::library::operation::Less<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::string_scalar("eee");
-  auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Less_Vector_Scalar_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using LESS = cudf::library::operation::Less<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto rhs = cudf::string_scalar("eee");
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using LESS = cudf::library::operation::Less<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
-  auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using GREATER = cudf::library::operation::Greater<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
-  auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Vector_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using EQUAL = cudf::library::operation::Equal<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
-  auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, EQUAL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Scalar_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using EQUAL = cudf::library::operation::Equal<TypeOut, TypeLhs, TypeRhs>;
-
-  auto rhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
-  auto lhs = cudf::string_scalar("");
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, EQUAL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, LessEqual_Vector_Vector_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using LESS_EQUAL = cudf::library::operation::LessEqual<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
-  auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LESS_EQUAL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LESS_EQUAL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, GreaterEqual_Vector_Vector_B8_STR_STR)
-{
-  using TypeOut = bool;
-  using TypeLhs = std::string;
-  using TypeRhs = std::string;
-
-  using GREATER_EQUAL = cudf::library::operation::GreaterEqual<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
-  auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"});
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::GREATER_EQUAL, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, GREATER_EQUAL());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_LEFT = cudf::library::operation::ShiftLeft<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32_SI16_SI64)
-{
-  using TypeOut = int;
-  using TypeLhs = int16_t;
-  using TypeRhs = int64_t;
-
-  using SHIFT_LEFT = cudf::library::operation::ShiftLeft<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Scalar_Vector_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_LEFT = cudf::library::operation::ShiftLeft<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Scalar_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_LEFT = cudf::library::operation::ShiftLeft<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_LEFT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_RIGHT = cudf::library::operation::ShiftRight<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32_SI16_SI64)
-{
-  using TypeOut = int;
-  using TypeLhs = int16_t;
-  using TypeRhs = int64_t;
-
-  using SHIFT_RIGHT = cudf::library::operation::ShiftRight<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRight_Scalar_Vector_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_RIGHT = cudf::library::operation::ShiftRight<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Scalar_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_RIGHT = cudf::library::operation::ShiftRight<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  int num_els = 4;
-
-  TypeLhs lhs[] = {-8, 78, -93, 0, -INT_MAX};
-  cudf::test::fixed_width_column_wrapper<TypeLhs> lhs_w(lhs, lhs + num_els);
-
-  TypeRhs shift[] = {1, 1, 3, 2, 16};
-  cudf::test::fixed_width_column_wrapper<TypeRhs> shift_w(shift, shift + num_els);
-
-  TypeOut expected[] = {2147483644, 39, 536870900, 0, 32768};
-  cudf::test::fixed_width_column_wrapper<TypeOut> expected_w(expected, expected + num_els);
-
-  auto out = cudf::jit::binary_operation(
-    lhs_w, shift_w, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_w);
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32_SI16_SI64)
-{
-  using TypeOut = int;
-  using TypeLhs = int16_t;
-  using TypeRhs = int64_t;
-
-  using SHIFT_RIGHT_UNSIGNED =
-    cudf::library::operation::ShiftRightUnsigned<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_RIGHT_UNSIGNED =
-    cudf::library::operation::ShiftRightUnsigned<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Scalar_SI32)
-{
-  using TypeOut = int;
-  using TypeLhs = int;
-  using TypeRhs = int;
-
-  using SHIFT_RIGHT_UNSIGNED =
-    cudf::library::operation::ShiftRightUnsigned<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_scalar<TypeRhs>();
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
-}
-
-TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Scalar_SI32_SI32_float)
-{
-  using TypeOut = int;      // Cast the result value to int for easy comparison
-  using TypeLhs = int32_t;  // All input types get converted into doubles
-  using TypeRhs = float;
-
-  using LOG_BASE = cudf::library::operation::LogBase<TypeOut, TypeLhs, TypeRhs>;
-
-  // Make sure there are no zeros. The log value is purposefully cast to int for easy comparison
-  auto elements = cudf::detail::make_counting_transform_iterator(1, [](auto i) { return i + 10; });
-  fixed_width_column_wrapper<TypeLhs> lhs(elements, elements + 100);
-  // Find log to the base 10
-  auto rhs = numeric_scalar<TypeRhs>(10);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LOG_BASE());
-}
-
-TEST_F(BinaryOperationIntegrationTest, LogBase_Scalar_Vector_float_SI32)
-{
-  using TypeOut = float;
-  using TypeLhs = int;
-  using TypeRhs = int;  // Integral types promoted to double
-
-  using LOG_BASE = cudf::library::operation::LogBase<TypeOut, TypeLhs, TypeRhs>;
-
-  // Make sure there are no zeros
-  auto elements = cudf::detail::make_counting_transform_iterator(1, [](auto i) { return i + 30; });
-  fixed_width_column_wrapper<TypeRhs> rhs(elements, elements + 100);
-  // Find log to the base 2
-  auto lhs = numeric_scalar<TypeLhs>(2);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LOG_BASE());
-}
-
-TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Vector_double_SI64_SI32)
-{
-  using TypeOut = double;
-  using TypeLhs = int64_t;
-  using TypeRhs = int32_t;  // Integral types promoted to double
-
-  using LOG_BASE = cudf::library::operation::LogBase<TypeOut, TypeLhs, TypeRhs>;
-
-  // Make sure there are no zeros
-  auto elements =
-    cudf::detail::make_counting_transform_iterator(1, [](auto i) { return std::pow(2, i); });
-  fixed_width_column_wrapper<TypeLhs> lhs(elements, elements + 50);
-
-  // Find log to the base 7
-  auto rhs_elements = cudf::detail::make_counting_transform_iterator(0, [](auto) { return 7; });
-  fixed_width_column_wrapper<TypeRhs> rhs(rhs_elements, rhs_elements + 50);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, LOG_BASE());
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_SI32_SI32)
-{
-  using TypeOut = bool;
-  using TypeLhs = int32_t;
-  using TypeRhs = int32_t;
-
-  auto int_col =
-    fixed_width_column_wrapper<TypeLhs>{{999, -37, 0, INT32_MAX}, {true, true, true, false}};
-  auto int_scalar = cudf::scalar_type_t<TypeRhs>(999);
-
-  auto op_col = cudf::jit::binary_operation(
-    int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{true, false, false, false}, {true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_ScalarInvalid_B8_SI32_SI32)
-{
-  using TypeOut = bool;
-  using TypeLhs = int32_t;
-  using TypeRhs = int32_t;
-
-  auto int_col    = fixed_width_column_wrapper<TypeLhs>{{-INT32_MAX, -37, 0, 499, 44, INT32_MAX},
-                                                     {false, true, false, true, true, false}};
-  auto int_scalar = cudf::scalar_type_t<TypeRhs>(999);
-  int_scalar.set_valid_async(false);
-
-  auto op_col = cudf::jit::binary_operation(
-    int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col,
-                                 fixed_width_column_wrapper<bool>{
-                                   {true, false, true, false, false, true},
-                                   {true, true, true, true, true, true},
-                                 },
-                                 verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_tsD_tsD)
-{
-  using TypeOut = bool;
-  using TypeLhs = cudf::timestamp_D;
-  using TypeRhs = cudf::timestamp_D;
-
-  cudf::test::fixed_width_column_wrapper<TypeLhs, TypeLhs::rep> ts_col{
-    {
-      999,    // Random nullable field
-      0,      // This is the UNIX epoch - 1970-01-01
-      44376,  // 2091-07-01 00:00:00 GMT
-      47695,  // 2100-08-02 00:00:00 GMT
-      3,      // Random nullable field
-      66068,  // 2150-11-21 00:00:00 GMT
-      22270,  // 2030-12-22 00:00:00 GMT
-      111,    // Random nullable field
-    },
-    {false, true, true, true, false, true, true, false}};
-  auto ts_scalar = cudf::scalar_type_t<TypeRhs>(typename TypeRhs::duration{44376}, true);
-
-  auto op_col = cudf::jit::binary_operation(
-    ts_scalar, ts_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col,
-                                 fixed_width_column_wrapper<bool>{
-                                   {false, false, true, false, false, false, false, false},
-                                   {true, true, true, true, true, true, true, true},
-                                 },
-                                 verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_string_EmptyString)
-{
-  using TypeOut = bool;
-
-  auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
-                                                    {true, false, true, true, true, false, true});
-  // Empty string
-  cudf::string_scalar str_scalar("");
-
-  auto op_col = cudf::jit::binary_operation(
-    str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, false, true, false, false, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_ValidString)
-{
-  using TypeOut = bool;
-
-  auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
-                                                    {true, false, true, true, true, false, true});
-  // Match a valid string
-  cudf::string_scalar str_scalar("<null>");
-
-  auto op_col = cudf::jit::binary_operation(
-    str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, true, false, false, false, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_string_NoMatch)
-{
-  using TypeOut = bool;
-
-  // Try with non nullable input
-  auto str_col =
-    cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"});
-  // Matching a string that isn't present
-  cudf::string_scalar str_scalar("foo");
-
-  auto op_col = cudf::jit::binary_operation(
-    str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, false, false, false, false, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_NullNonNull)
-{
-  using TypeOut = bool;
-
-  // Try with all invalid input
-  auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
-                                                    {true, true, true, true, true, true, true});
-  // Matching a scalar that is invalid
-  cudf::string_scalar str_scalar("foo");
-  str_scalar.set_valid_async(false);
-
-  auto op_col = cudf::jit::binary_operation(
-    str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, false, false, false, false, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_string_NullNonNull)
-{
-  using TypeOut = bool;
-
-  // Try with all invalid input
-  auto str_col =
-    cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
-                                       {false, false, false, false, false, false, false});
-  // Matching a scalar that is valid
-  cudf::string_scalar str_scalar("foo");
-
-  auto op_col = cudf::jit::binary_operation(
-    str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, false, false, false, false, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_NullNull)
-{
-  using TypeOut = bool;
-
-  // Try with all invalid input
-  auto str_col =
-    cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
-                                       {false, false, false, false, false, false, false});
-  // Matching a scalar that is invalid
-  cudf::string_scalar str_scalar("foo");
-  str_scalar.set_valid_async(false);
-
-  auto op_col = cudf::jit::binary_operation(
-    str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{true, true, true, true, true, true, true},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_string_MatchInvalid)
-{
-  using TypeOut = bool;
-
-  auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
-                                                    {true, false, true, true, true, false, true});
-  // Matching an invalid string
-  cudf::string_scalar str_scalar("bb");
-
-  auto op_col = cudf::jit::binary_operation(
-    str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, false, false, false, false, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_InvalidScalar_B8_string_string)
-{
-  using TypeOut = bool;
-
-  auto str_col = cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
-                                                    {true, false, true, true, true, false, true});
-  // Valid string invalidated
-  cudf::string_scalar str_scalar("bb");
-  str_scalar.set_valid_async(false);
-
-  auto op_col = cudf::jit::binary_operation(
-    str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, true, false, false, false, true, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_tsD_tsD_NonNullable)
-{
-  using TypeOut = bool;
-  using TypeLhs = cudf::timestamp_D;
-  using TypeRhs = cudf::timestamp_D;
-
-  cudf::test::fixed_width_column_wrapper<TypeLhs, TypeLhs::rep> lhs_col{
-    0,      // This is the UNIX epoch - 1970-01-01
-    44376,  // 2091-07-01 00:00:00 GMT
-    47695,  // 2100-08-02 00:00:00 GMT
-    66068,  // 2150-11-21 00:00:00 GMT
-    22270,  // 2030-12-22 00:00:00 GMT
-  };
-  ASSERT_EQ(column_view{lhs_col}.nullable(), false);
-  cudf::test::fixed_width_column_wrapper<TypeRhs, TypeRhs::rep> rhs_col{
-    0,      // This is the UNIX epoch - 1970-01-01
-    44380,  // Mismatched
-    47695,  // 2100-08-02 00:00:00 GMT
-    66070,  // Mismatched
-    22270,  // 2030-12-22 00:00:00 GMT
-  };
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col,
-                                 fixed_width_column_wrapper<bool>{
-                                   {true, false, true, false, true},
-                                   {true, true, true, true, true},
-                                 },
-                                 verbosity);
-}
-
-// Both vectors with mixed validity
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_MixMix)
-{
-  using TypeOut = bool;
-
-  auto lhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
-                                       {true, false, true, true, true, false, true});
-  auto rhs_col =
-    cudf::test::strings_column_wrapper({"foo", "valid", "<null>", "", "invalid", "inv", "ééé"},
-                                       {true, true, true, true, false, false, true});
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, true, true, false, true, true},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_MixValid)
-{
-  using TypeOut = bool;
-
-  auto lhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
-                                       {true, false, true, true, true, false, true});
-  auto rhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"});
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{true, false, true, true, true, false, true},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_MixInvalid)
-{
-  using TypeOut = bool;
-
-  auto lhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
-                                       {true, false, true, true, true, false, true});
-  auto rhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
-                                       {false, false, false, false, false, false, false});
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, true, false, false, false, true, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_ValidValid)
-{
-  using TypeOut = bool;
-
-  auto lhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"});
-  auto rhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"});
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{true, true, true, true, true, true, true},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_ValidInvalid)
-{
-  using TypeOut = bool;
-
-  auto lhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"});
-  auto rhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
-                                       {false, false, false, false, false, false, false});
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{false, false, false, false, false, false, false},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_string_InvalidInvalid)
-{
-  using TypeOut = bool;
-
-  auto lhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
-                                       {false, false, false, false, false, false, false});
-  auto rhs_col =
-    cudf::test::strings_column_wrapper({"eee", "invalid", "<null>", "", "aa", "invalid", "ééé"},
-                                       {false, false, false, false, false, false, false});
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<bool>{{true, true, true, true, true, true, true},
-                                     {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_VectorAllInvalid_B8_SI32_SI32)
-{
-  using TypeOut = bool;
-  using TypeLhs = int32_t;
-
-  auto lhs_col = fixed_width_column_wrapper<TypeLhs>{{-INT32_MAX, -37, 0, 499, 44, INT32_MAX},
-                                                     {false, false, false, false, false, false}};
-  auto rhs_col = fixed_width_column_wrapper<TypeLhs>{{-47, 37, 12, 99, 4, -INT32_MAX},
-                                                     {false, false, false, false, false, false}};
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col,
-                                 fixed_width_column_wrapper<bool>{
-                                   {true, true, true, true, true, true},
-                                   {true, true, true, true, true, true},
-                                 },
-                                 verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_SI8)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int32_t;
-  using TypeRhs = int8_t;
-
-  auto int_col = fixed_width_column_wrapper<TypeLhs>{
-    {999, -37, 0, INT32_MAX},
-  };
-  auto int_scalar = cudf::scalar_type_t<TypeRhs>(77);
-
-  auto op_col = cudf::jit::binary_operation(
-    int_col, int_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<TypeOut>{{77, -37, 0, 77}, {true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_FP64_SI32_SI64)
-{
-  using TypeOut = double;
-  using TypeLhs = int32_t;
-  using TypeRhs = int64_t;
-
-  auto int_col =
-    fixed_width_column_wrapper<TypeLhs>{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55},
-                                        {false, true, false, true, false, true, false}};
-  auto int_scalar = cudf::scalar_type_t<TypeRhs>(INT32_MAX);
-
-  auto op_col = cudf::jit::binary_operation(
-    int_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<TypeOut>{
-      {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX},
-      {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_FP32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int32_t;
-  using TypeRhs = float;
-
-  auto int_col =
-    fixed_width_column_wrapper<TypeLhs>{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55},
-                                        {false, true, false, true, false, true, false}};
-  auto float_scalar = cudf::scalar_type_t<TypeRhs>(-3.14f);
-  float_scalar.set_valid_async(false);
-
-  auto op_col = cudf::jit::binary_operation(
-    int_col, float_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<TypeOut>{{0, -37, 0, INT32_MAX, 0, -4379, 0},
-                                        {false, true, false, true, false, true, false}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_SI8_SI8_FP32)
-{
-  using TypeOut = int8_t;
-  using TypeLhs = int8_t;
-  using TypeRhs = float;
-
-  auto int_col = fixed_width_column_wrapper<TypeLhs>{
-    {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}};
-  auto float_scalar = cudf::scalar_type_t<TypeRhs>(-3.14f);
-  float_scalar.set_valid_async(false);
-
-  auto op_col = cudf::jit::binary_operation(
-    float_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<TypeOut>{{0, 0, 0, 0, 0, 0, 0},
-                                        {false, false, false, false, false, false, false}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_SI64_SI32_SI8)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int32_t;
-
-  auto int_col =
-    fixed_width_column_wrapper<TypeLhs>{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55},
-                                        {false, false, false, false, false, false, false}};
-  auto another_int_col = fixed_width_column_wrapper<TypeLhs>{
-    {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}};
-
-  auto op_col = cudf::jit::binary_operation(
-    int_col, another_int_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<TypeOut>{{0, 0, 0, 0, 0, 0, 0},
-                                        {false, false, false, false, false, false, false}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI64_SI32_SI8)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int32_t;
-
-  auto int_col = fixed_width_column_wrapper<TypeLhs>{
-    {999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55}, {true, true, true, true, true, true, true}};
-  auto another_int_col = fixed_width_column_wrapper<TypeLhs>{
-    {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}};
-
-  auto op_col = cudf::jit::binary_operation(
-    int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<TypeOut>{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55},
-                                        {true, true, true, true, true, true, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_tsD_tsD_tsD)
-{
-  cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep> lhs_col{
-    {
-      0,      // This is the UNIX epoch - 1970-01-01
-      44376,  // 2091-07-01 00:00:00 GMT
-      47695,  // 2100-08-02 00:00:00 GMT
-      66068,  // 2150-11-21 00:00:00 GMT
-      22270,  // 2030-12-22 00:00:00 GMT
-    },
-    {true, false, true, true, false}};
-  cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep> rhs_col{
-    {
-      0,      // This is the UNIX epoch - 1970-01-01
-      44380,  // Mismatched
-      47695,  // 2100-08-02 00:00:00 GMT
-      66070,  // Mismatched
-      22270,  // 2030-12-22 00:00:00 GMT
-    },
-    {false, true, true, true, false}};
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id<cudf::timestamp_D>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 44380, 47695, 66068, 0}, {true, true, true, true, false}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI32_SI64_SI8)
-{
-  using TypeOut = int32_t;
-  using TypeLhs = int64_t;
-
-  auto int_col =
-    fixed_width_column_wrapper<TypeLhs>{{999, -37, 0, INT32_MAX, -INT32_MAX, -4379, 55},
-                                        {false, false, false, false, false, false, false}};
-  auto another_int_col = fixed_width_column_wrapper<TypeLhs>{
-    {9, -37, 0, 32, -47, -4, 55}, {true, false, true, false, true, false, true}};
-
-  auto op_col = cudf::jit::binary_operation(
-    int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id<TypeOut>()));
-
-  // Every row has a value
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
-    *op_col,
-    fixed_width_column_wrapper<TypeOut>{{9, 0, 0, 0, -47, 0, 55},
-                                        {true, false, true, false, true, false, true}},
-    verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_string_string_string_Mix)
-{
-  auto lhs_col = cudf::test::strings_column_wrapper(
-    {"eee", "invalid", "<null>", "", "", "", "ééé", "foo", "bar", "abc", "def"},
-    {false, true, true, false, true, true, true, false, false, true, true});
-  auto rhs_col = cudf::test::strings_column_wrapper(
-    {"eee", "goo", "<null>", "", "", "", "ééé", "bar", "foo", "def", "abc"},
-    {false, true, true, true, false, true, true, false, false, true, true});
-
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, rhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING});
-
-  auto exp_col = cudf::test::strings_column_wrapper(
-    {"", "invalid", "<null>", "", "", "", "ééé", "", "", "def", "def"},
-    {false, true, true, true, true, true, true, false, false, true, true});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, exp_col, verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_string_string_string_Mix)
-{
-  auto lhs_col = cudf::test::strings_column_wrapper(
-    {"eee", "invalid", "<null>", "", "", "", "ééé", "foo", "bar", "abc", "foo"},
-    {false, true, true, false, true, true, true, false, false, true, true});
-  cudf::string_scalar str_scalar("foo");
-
-  // Returns a non-nullable column as all elements are valid - it will have the scalar
-  // value at the very least
-  auto op_col = cudf::jit::binary_operation(
-    lhs_col, str_scalar, cudf::binary_operator::NULL_MIN, data_type{type_id::STRING});
-
-  auto exp_col = cudf::test::strings_column_wrapper(
-    {"foo", "foo", "<null>", "foo", "", "", "foo", "foo", "foo", "abc", "foo"});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, exp_col, verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_string_string_string_Mix)
-{
-  auto lhs_col = cudf::test::strings_column_wrapper(
-    {"eee", "invalid", "<null>", "", "", "", "ééé", "foo", "bar", "abc", "foo"},
-    {false, true, true, false, true, true, true, false, false, true, true});
-  cudf::string_scalar str_scalar("foo");
-  str_scalar.set_valid_async(false);
-
-  // Returns the lhs_col
-  auto op_col = cudf::jit::binary_operation(
-    str_scalar, lhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING});
-
-  auto exp_col = cudf::test::strings_column_wrapper(
-    {"", "invalid", "<null>", "", "", "", "ééé", "", "", "abc", "foo"},
-    {false, true, true, false, true, true, true, false, false, true, true});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*op_col, exp_col, verbosity);
-}
-
-TEST_F(BinaryOperationIntegrationTest, CastAdd_Vector_Vector_SI32_float_float)
-{
-  using TypeOut = int32_t;
-  using TypeLhs = float;
-  using TypeRhs = float;  // Integral types promoted to double
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs      = cudf::test::fixed_width_column_wrapper<float>{1.3f, 1.6f};
-  auto rhs      = cudf::test::fixed_width_column_wrapper<float>{1.3f, 1.6f};
-  auto expected = cudf::test::fixed_width_column_wrapper<int>{2, 3};
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_TimepointD_DurationS_TimepointUS)
-{
-  using TypeOut = cudf::timestamp_us;
-  using TypeLhs = cudf::timestamp_D;
-  using TypeRhs = cudf::duration_s;
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_DurationD_TimepointS_TimepointS)
-{
-  using TypeOut = cudf::timestamp_s;
-  using TypeLhs = cudf::duration_D;
-  using TypeRhs = cudf::timestamp_s;
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(100);
-  auto rhs = cudf::scalar_type_t<TypeRhs>(typename TypeRhs::duration{34}, true);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_DurationS_DurationD_DurationMS)
-{
-  using TypeOut = cudf::duration_ms;
-  using TypeLhs = cudf::duration_s;
-  using TypeRhs = cudf::duration_D;
-
-  using ADD = cudf::library::operation::Add<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::scalar_type_t<TypeLhs>(TypeLhs{-9});
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI64_SI64_SI32)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int64_t;
-  using TypeRhs = int32_t;
-
-  using SHIFT_RIGHT_UNSIGNED =
-    cudf::library::operation::ShiftRightUnsigned<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = cudf::scalar_type_t<TypeLhs>(-12);
-  // this generates values in the range 1-10 which should be reasonable for the shift
-  auto rhs = make_random_wrapped_column<TypeRhs>(100);
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED());
-}
-
-TEST_F(BinaryOperationIntegrationTest, PMod_Scalar_Vector_FP32)
-{
-  using TypeOut = float;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  auto lhs = cudf::scalar_type_t<TypeLhs>(-86099.68377);
-  auto rhs = fixed_width_column_wrapper<TypeRhs>{{90770.74881, -15456.4335, 32213.22119}};
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
-
-  auto expected_result =
-    fixed_width_column_wrapper<TypeOut>{{4671.0625, -8817.51953125, 10539.974609375}};
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_result);
-}
-
-TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Scalar_FP64)
-{
-  using TypeOut = double;
-  using TypeLhs = double;
-  using TypeRhs = double;
-
-  auto lhs = fixed_width_column_wrapper<TypeLhs>{{90770.74881, -15456.4335, 32213.22119}};
-  auto rhs = cudf::scalar_type_t<TypeRhs>(-86099.68377);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
-
-  auto expected_result = fixed_width_column_wrapper<TypeOut>{
-    {4671.0650400000013178, -15456.433499999999185, 32213.221190000000206}};
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_result);
-}
-
-TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_FP32_FP64)
-{
-  using TypeOut = double;
-  using TypeLhs = float;
-  using TypeRhs = double;
-
-  auto lhs = fixed_width_column_wrapper<TypeLhs>{
-    {24854.55893, 79946.87288, -86099.68377, -86099.68377, 1.0, 1.0, -1.0, -1.0}};
-  auto rhs = fixed_width_column_wrapper<TypeRhs>{{90770.74881,
-                                                  -15456.4335,
-                                                  36223.96138,
-                                                  -15456.4335,
-                                                  2.1336193413893147E307,
-                                                  -2.1336193413893147E307,
-                                                  2.1336193413893147E307,
-                                                  -2.1336193413893147E307}};
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
-
-  auto expected_result = fixed_width_column_wrapper<TypeOut>{{24854.55859375,
-                                                              2664.7075000000040745,
-                                                              22572.196640000001935,
-                                                              -8817.5200000000040745,
-                                                              1.0,
-                                                              1.0,
-                                                              0.0,
-                                                              0.0}};
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_result);
-}
-
-TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_SI32_SI64)
-{
-  using TypeOut = double;
-  using TypeLhs = int32_t;
-  using TypeRhs = int64_t;
-
-  using PMOD = cudf::library::operation::PMod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(1000);
-  auto rhs = make_random_wrapped_column<TypeRhs>(1000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, PMOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_SI32_SI64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = int32_t;
-  using TypeRhs = int64_t;
-
-  using PMOD = cudf::library::operation::PMod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(1000);
-  auto rhs = make_random_wrapped_column<TypeRhs>(1000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, PMOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_FP64_FP64)
-{
-  using TypeOut = int64_t;
-  using TypeLhs = double;
-  using TypeRhs = double;
-
-  using PMOD = cudf::library::operation::PMod<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(1000);
-  auto rhs = make_random_wrapped_column<TypeRhs>(1000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id<TypeOut>()));
-
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, PMOD());
-}
-
-TEST_F(BinaryOperationIntegrationTest, ATan2_Scalar_Vector_FP32)
-{
-  using TypeOut = float;
-  using TypeLhs = float;
-  using TypeRhs = float;
-
-  using ATAN2 = cudf::library::operation::ATan2<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_scalar<TypeLhs>();
-  auto rhs = make_random_wrapped_column<TypeRhs>(10000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
-
-  // atan2 has a max ULP error of 2 per CUDA programming guide
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ATAN2(), NearEqualComparator<TypeOut>{2});
-}
-
-TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Scalar_FP64)
-{
-  using TypeOut = double;
-  using TypeLhs = double;
-  using TypeRhs = double;
-
-  using ATAN2 = cudf::library::operation::ATan2<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(10000);
-  auto rhs = make_random_wrapped_scalar<TypeRhs>();
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
-
-  // atan2 has a max ULP error of 2 per CUDA programming guide
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ATAN2(), NearEqualComparator<TypeOut>{2});
-}
-
-TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_FP32_FP64)
-{
-  using TypeOut = double;
-  using TypeLhs = float;
-  using TypeRhs = double;
-
-  using ATAN2 = cudf::library::operation::ATan2<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(10000);
-  auto rhs = make_random_wrapped_column<TypeRhs>(10000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
-
-  // atan2 has a max ULP error of 2 per CUDA programming guide
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ATAN2(), NearEqualComparator<TypeOut>{2});
-}
-
-TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_SI32_SI64)
-{
-  using TypeOut = double;
-  using TypeLhs = int32_t;
-  using TypeRhs = int64_t;
-
-  using ATAN2 = cudf::library::operation::ATan2<TypeOut, TypeLhs, TypeRhs>;
-
-  auto lhs = make_random_wrapped_column<TypeLhs>(10000);
-  auto rhs = make_random_wrapped_column<TypeRhs>(10000);
-
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id<TypeOut>()));
-
-  // atan2 has a max ULP error of 2 per CUDA programming guide
-  ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ATAN2(), NearEqualComparator<TypeOut>{2});
-}
-
-template <typename T>
-struct FixedPointTestAllReps : public cudf::test::BaseFixture {
-};
-
-template <typename T>
-using wrapper = cudf::test::fixed_width_column_wrapper<T>;
-TYPED_TEST_SUITE(FixedPointTestAllReps, cudf::test::FixedPointTypes);
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-
-  auto const sz = std::size_t{1000};
-
-  auto begin      = cudf::detail::make_counting_transform_iterator(1, [](auto i) {
-    return decimalXX{i, scale_type{0}};
-  });
-  auto const vec1 = std::vector<decimalXX>(begin, begin + sz);
-  auto const vec2 = std::vector<decimalXX>(sz, decimalXX{2, scale_type{0}});
-  auto expected   = std::vector<decimalXX>(sz);
-
-  std::transform(std::cbegin(vec1),
-                 std::cend(vec1),
-                 std::cbegin(vec2),
-                 std::begin(expected),
-                 std::plus<decimalXX>());
-
-  auto const lhs          = wrapper<decimalXX>(vec1.begin(), vec1.end());
-  auto const rhs          = wrapper<decimalXX>(vec2.begin(), vec2.end());
-  auto const expected_col = wrapper<decimalXX>(expected.begin(), expected.end());
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-
-  auto const sz = std::size_t{1000};
-
-  auto begin      = cudf::detail::make_counting_transform_iterator(1, [](auto i) {
-    return decimalXX{i, scale_type{0}};
-  });
-  auto const vec1 = std::vector<decimalXX>(begin, begin + sz);
-  auto const vec2 = std::vector<decimalXX>(sz, decimalXX{2, scale_type{0}});
-  auto expected   = std::vector<decimalXX>(sz);
-
-  std::transform(std::cbegin(vec1),
-                 std::cend(vec1),
-                 std::cbegin(vec2),
-                 std::begin(expected),
-                 std::multiplies<decimalXX>());
-
-  auto const lhs          = wrapper<decimalXX>(vec1.begin(), vec1.end());
-  auto const rhs          = wrapper<decimalXX>(vec2.begin(), vec2.end());
-  auto const expected_col = wrapper<decimalXX>(expected.begin(), expected.end());
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
-}
-
-template <typename T>
-using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiply2)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{11, 22, 33, 44, 55}, scale_type{-1}};
-  auto const rhs      = fp_wrapper<RepType>{{10, 10, 10, 10, 10}, scale_type{0}};
-  auto const expected = fp_wrapper<RepType>{{110, 220, 330, 440, 550}, scale_type{-1}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
-  auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{0}};
-  auto const expected = fp_wrapper<RepType>{{2, 7, 12, 17}, scale_type{-1}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv2)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
-  auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{-2}};
-  auto const expected = fp_wrapper<RepType>{{2, 7, 12, 17}, scale_type{1}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv3)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(12, scale_type{-1});
-  auto const expected = fp_wrapper<RepType>{{0, 2, 4, 5}, scale_type{0}};
-
-  auto const type = cudf::binary_operation_fixed_point_output_type(
-    cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpDiv4)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto begin = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 11; });
-  auto result_begin =
-    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i * 11) / 12; });
-  auto const lhs      = fp_wrapper<RepType>(begin, begin + 1000, scale_type{-1});
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(12, scale_type{-1});
-  auto const expected = fp_wrapper<RepType>(result_begin, result_begin + 1000, scale_type{0});
-
-  auto const type = cudf::binary_operation_fixed_point_output_type(
-    cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd2)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{11, 22, 33, 44, 55}, scale_type{-1}};
-  auto const rhs      = fp_wrapper<RepType>{{100, 200, 300, 400, 500}, scale_type{-2}};
-  auto const expected = fp_wrapper<RepType>{{210, 420, 630, 840, 1050}, scale_type{-2}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd3)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{1100, 2200, 3300, 4400, 5500}, scale_type{-3}};
-  auto const rhs      = fp_wrapper<RepType>{{100, 200, 300, 400, 500}, scale_type{-2}};
-  auto const expected = fp_wrapper<RepType>{{2100, 4200, 6300, 8400, 10500}, scale_type{-3}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd4)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{11, 22, 33, 44, 55}, scale_type{-1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(100, scale_type{-2});
-  auto const expected = fp_wrapper<RepType>{{210, 320, 430, 540, 650}, scale_type{-2}};
-
-  auto const type = cudf::binary_operation_fixed_point_output_type(
-    cudf::binary_operator::ADD, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd5)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = make_fixed_point_scalar<decimalXX>(100, scale_type{-2});
-  auto const rhs      = fp_wrapper<RepType>{{11, 22, 33, 44, 55}, scale_type{-1}};
-  auto const expected = fp_wrapper<RepType>{{210, 320, 430, 540, 650}, scale_type{-2}};
-
-  auto const type = cudf::binary_operation_fixed_point_output_type(
-    cudf::binary_operator::ADD, lhs->type(), static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpAdd6)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const col = fp_wrapper<RepType>{{3, 4, 5, 6, 7, 8}, scale_type{0}};
-
-  auto const expected1 = fp_wrapper<RepType>{{6, 8, 10, 12, 14, 16}, scale_type{0}};
-  auto const expected2 = fp_wrapper<RepType>{{0, 0, 1, 1, 1, 1}, scale_type{1}};
-  auto const type1     = cudf::data_type{cudf::type_to_id<decimalXX>(), 0};
-  auto const type2     = cudf::data_type{cudf::type_to_id<decimalXX>(), 1};
-  auto const result1   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1);
-  auto const result2   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointCast)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const col      = fp_wrapper<RepType>{{6, 8, 10, 12, 14, 16}, scale_type{0}};
-  auto const expected = fp_wrapper<RepType>{{0, 0, 1, 1, 1, 1}, scale_type{1}};
-  auto const type     = cudf::data_type{cudf::type_to_id<decimalXX>(), 1};
-  auto const result   = cudf::cast(col, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpMultiplyScalar)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{11, 22, 33, 44, 55}, scale_type{-1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(100, scale_type{-1});
-  auto const expected = fp_wrapper<RepType>{{1100, 2200, 3300, 4400, 5500}, scale_type{-2}};
-
-  auto const type = cudf::binary_operation_fixed_point_output_type(
-    cudf::binary_operator::MUL, static_cast<cudf::column_view>(lhs).type(), rhs->type());
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpSimplePlus)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{150, 200}, scale_type{-2}};
-  auto const rhs      = fp_wrapper<RepType>{{2250, 1005}, scale_type{-3}};
-  auto const expected = fp_wrapper<RepType>{{3750, 3005}, scale_type{-3}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
-                                                   static_cast<cudf::column_view>(lhs).type(),
-                                                   static_cast<cudf::column_view>(rhs).type());
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimple)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const trues    = std::vector<bool>(4, true);
-  auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, scale_type{0}};
-  auto const col2     = fp_wrapper<RepType>{{100, 200, 300, 400}, scale_type{-2}};
-  auto const expected = wrapper<bool>(trues.begin(), trues.end());
-
-  auto const result =
-    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const trues    = std::vector<bool>(4, true);
-  auto const col      = fp_wrapper<RepType>{{1, 2, 3, 4}, scale_type{0}};
-  auto const expected = wrapper<bool>(trues.begin(), trues.end());
-
-  auto const result =
-    cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale0Null)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{0}};
-  auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
-  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
-
-  auto const result =
-    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualSimpleScale2Null)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}};
-  auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
-  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
-
-  auto const result =
-    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpEqualLessGreater)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const sz = std::size_t{1000};
-
-  // TESTING binary op ADD
-
-  auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto e) { return e * 1000; });
-  auto const vec1 = std::vector<RepType>(begin, begin + sz);
-  auto const vec2 = std::vector<RepType>(sz, 0);
-
-  auto const iota_3  = fp_wrapper<RepType>(vec1.begin(), vec1.end(), scale_type{-3});
-  auto const zeros_3 = fp_wrapper<RepType>(vec2.begin(), vec2.end(), scale_type{-1});
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
-                                                   static_cast<cudf::column_view>(iota_3).type(),
-                                                   static_cast<cudf::column_view>(zeros_3).type());
-  auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view());
-
-  // TESTING binary op EQUAL, LESS, GREATER
-
-  auto const trues    = std::vector<bool>(sz, true);
-  auto const true_col = wrapper<bool>(trues.begin(), trues.end());
-
-  auto const btype = cudf::data_type{type_id::BOOL8};
-  auto const equal_result =
-    cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view());
-
-  auto const less_result =
-    cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view());
-
-  auto const greater_result =
-    cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMaxSimple)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const trues    = std::vector<bool>(4, true);
-  auto const col1     = fp_wrapper<RepType>{{40, 30, 20, 10, 0}, {1, 0, 1, 1, 0}, scale_type{-2}};
-  auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 1, 1, 0, 0}, scale_type{-2}};
-  auto const expected = fp_wrapper<RepType>{{40, 20, 30, 10, 0}, {1, 1, 1, 1, 0}, scale_type{-2}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX,
-                                                   static_cast<cudf::column_view>(col1).type(),
-                                                   static_cast<cudf::column_view>(col2).type());
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullMinSimple)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const trues    = std::vector<bool>(4, true);
-  auto const col1     = fp_wrapper<RepType>{{40, 30, 20, 10, 0}, {1, 1, 1, 0, 0}, scale_type{-1}};
-  auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 0, 1, 1, 0}, scale_type{-1}};
-  auto const expected = fp_wrapper<RepType>{{10, 30, 20, 40, 0}, {1, 1, 1, 1, 0}, scale_type{-1}};
-
-  auto const type =
-    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN,
-                                                   static_cast<cudf::column_view>(col1).type(),
-                                                   static_cast<cudf::column_view>(col2).type());
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpNullEqualsSimple)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const trues    = std::vector<bool>(4, true);
-  auto const col1     = fp_wrapper<RepType>{{400, 300, 300, 100}, {1, 1, 1, 0}, scale_type{-2}};
-  auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
-  auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
-
-  auto const result = cudf::binary_operation(
-    col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
-  auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{0}};
-  auto const expected = fp_wrapper<RepType>{{25, 75, 125, 175}, scale_type{-2}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div2)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{100000, 300000, 500000, 700000}, scale_type{-3}};
-  auto const rhs      = fp_wrapper<RepType>{{20, 20, 20, 20}, scale_type{-1}};
-  auto const expected = fp_wrapper<RepType>{{5000, 15000, 25000, 35000}, scale_type{-2}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div3)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{10000, 30000, 50000, 70000}, scale_type{-2}};
-  auto const rhs      = fp_wrapper<RepType>{{3, 9, 3, 3}, scale_type{0}};
-  auto const expected = fp_wrapper<RepType>{{3333, 3333, 16666, 23333}, scale_type{-2}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div4)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(3, scale_type{0});
-  auto const expected = fp_wrapper<RepType>{{3, 10, 16, 23}, scale_type{1}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div6)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs = make_fixed_point_scalar<decimalXX>(3000, scale_type{-3});
-  auto const rhs = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
-
-  auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div7)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs = make_fixed_point_scalar<decimalXX>(1200, scale_type{0});
-  auto const rhs = fp_wrapper<RepType>{{100, 200, 300, 500, 600, 800, 1200, 1300}, scale_type{-2}};
-
-  auto const expected = fp_wrapper<RepType>{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div8)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{4000, 6000, 80000}, scale_type{-1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(5000, scale_type{-3});
-  auto const expected = fp_wrapper<RepType>{{0, 1, 16}, scale_type{2}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div9)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{10, 20, 30}, scale_type{2}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{1});
-  auto const expected = fp_wrapper<RepType>{{1, 2, 4}, scale_type{1}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div10)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{100, 200, 300}, scale_type{1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{0});
-  auto const expected = fp_wrapper<RepType>{{14, 28, 42}, scale_type{1}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOp_Div11)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs      = fp_wrapper<RepType>{{1000, 2000, 3000}, scale_type{1}};
-  auto const rhs      = fp_wrapper<RepType>{{7, 7, 7}, scale_type{0}};
-  auto const expected = fp_wrapper<RepType>{{142, 285, 428}, scale_type{1}};
-
-  auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-}
-
-TYPED_TEST(FixedPointTestAllReps, FixedPointBinaryOpThrows)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
-  auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
-  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
-               cudf::logic_error);
-}
-
-template <typename T>
-struct FixedPointTest_64_128_Reps : public cudf::test::BaseFixture {
-};
-
-using Decimal64And128Types = cudf::test::Types<numeric::decimal64, numeric::decimal128>;
-TYPED_TEST_SUITE(FixedPointTest_64_128_Reps, Decimal64And128Types);
-
-TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  for (auto const rhs_value : {10000000000000000, 100000000000000000}) {
-    auto const lhs       = fp_wrapper<RepType>{{33041, 97290, 36438, 25379, 48473}, scale_type{2}};
-    auto const rhs       = make_fixed_point_scalar<decimalXX>(rhs_value, scale_type{0});
-    auto const trues     = wrapper<bool>{{1, 1, 1, 1, 1}};
-    auto const falses    = wrapper<bool>{{0, 0, 0, 0, 0}};
-    auto const bool_type = cudf::data_type{type_id::BOOL8};
-
-    auto const a = cudf::binary_operation(lhs, *rhs, binary_operator::LESS, bool_type);
-    auto const b = cudf::binary_operation(lhs, *rhs, binary_operator::LESS_EQUAL, bool_type);
-    auto const c = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER, bool_type);
-    auto const d = cudf::binary_operation(lhs, *rhs, binary_operator::GREATER_EQUAL, bool_type);
-    auto const e = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER, bool_type);
-    auto const f = cudf::binary_operation(*rhs, lhs, binary_operator::GREATER_EQUAL, bool_type);
-    auto const g = cudf::binary_operation(*rhs, lhs, binary_operator::LESS, bool_type);
-    auto const h = cudf::binary_operation(*rhs, lhs, binary_operator::LESS_EQUAL, bool_type);
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(a->view(), trues);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(b->view(), trues);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(c->view(), falses);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(d->view(), falses);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(e->view(), trues);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(f->view(), trues);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(g->view(), falses);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses);
-  }
-}
-
-}  // namespace binop
-}  // namespace test
-}  // namespace cudf
-
-CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/binaryop/binop-null-test.cpp b/cpp/tests/binaryop/binop-null-test.cpp
index b7e7702bd6c..55ddde5ce5f 100644
--- a/cpp/tests/binaryop/binop-null-test.cpp
+++ b/cpp/tests/binaryop/binop-null-test.cpp
@@ -74,8 +74,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_Valid)
   lhs.set_valid_async(false);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::ALL_VALID);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -91,8 +91,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Valid_Vector_NonNullable)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -109,8 +109,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_NonNullable)
   lhs.set_valid_async(false);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -126,8 +126,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Scalar_Valid)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::ALL_NULL);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -143,8 +143,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_Valid)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::ALL_NULL);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::ALL_VALID);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -160,8 +160,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_NonNullable)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::ALL_NULL);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -177,8 +177,8 @@ TEST_F(BinaryOperationNullTest, Vector_Valid_Vector_NonNullable)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::ALL_VALID);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
@@ -194,8 +194,8 @@ TEST_F(BinaryOperationNullTest, Vector_NonNullable_Vector_NonNullable)
   auto lhs = make_random_wrapped_column<TypeLhs>(100, mask_state::UNALLOCATED);
   auto rhs = make_random_wrapped_column<TypeRhs>(100, mask_state::UNALLOCATED);
 
-  auto out = cudf::jit::binary_operation(
-    lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
+  auto out =
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>()));
 
   ASSERT_BINOP<TypeOut, TypeLhs, TypeRhs>(*out, lhs, rhs, ADD());
 }
diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp
index 779dc7c4c1f..167fbc22bde 100644
--- a/cpp/tests/binaryop/binop-verify-input-test.cpp
+++ b/cpp/tests/binaryop/binop-verify-input-test.cpp
@@ -35,9 +35,9 @@ TEST_F(BinopVerifyInputTest, Vector_Scalar_ErrorOutputVectorType)
   auto lhs = make_random_wrapped_scalar<TypeLhs>();
   auto rhs = make_random_wrapped_column<TypeRhs>(10);
 
-  EXPECT_THROW(cudf::jit::binary_operation(
-                 lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)),
-               cudf::logic_error);
+  EXPECT_THROW(
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)),
+    cudf::logic_error);
 }
 
 TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize)
@@ -49,9 +49,9 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize)
   auto lhs = make_random_wrapped_column<TypeLhs>(1);
   auto rhs = make_random_wrapped_column<TypeRhs>(10);
 
-  EXPECT_THROW(cudf::jit::binary_operation(
-                 lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>())),
-               cudf::logic_error);
+  EXPECT_THROW(
+    cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id<TypeOut>())),
+    cudf::logic_error);
 }
 
 }  // namespace binop

From 16fcf4880e74c264899a8b09332cd8a99d08babe Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 25 Nov 2021 23:40:44 +0530
Subject: [PATCH 08/42] remove jit code which is supported by compiled binops

---
 cpp/include/cudf/binaryop.hpp        |  78 ----
 cpp/include/cudf/detail/binaryop.hpp |  45 +-
 cpp/src/binaryop/binaryop.cpp        | 237 +---------
 cpp/src/binaryop/jit/kernel.cu       |  51 ---
 cpp/src/binaryop/jit/operation.hpp   | 646 ---------------------------
 cpp/src/binaryop/jit/traits.hpp      |  68 ---
 cpp/src/binaryop/jit/util.hpp        |  88 ----
 7 files changed, 6 insertions(+), 1207 deletions(-)
 delete mode 100644 cpp/src/binaryop/jit/operation.hpp
 delete mode 100644 cpp/src/binaryop/jit/traits.hpp
 delete mode 100644 cpp/src/binaryop/jit/util.hpp

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index fe548a36cf0..a514010c1f0 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -210,83 +210,5 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
                                                          cudf::data_type const& lhs,
                                                          cudf::data_type const& rhs);
 
-namespace jit {
-/**
- * @brief Performs a binary operation between a scalar and a column.
- *
- * The output contains the result of `op(lhs, rhs[i])` for all `0 <= i < rhs.size()`
- * The scalar is the left operand and the column elements are the right operand.
- * This distinction is significant in case of non-commutative binary operations
- *
- * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
- *
- * @param lhs         The left operand scalar
- * @param rhs         The right operand column
- * @param op          The binary operator
- * @param output_type The desired data type of the output column
- * @param mr          Device memory resource used to allocate the returned column's device memory
- * @return            Output column of `output_type` type containing the result of
- *                    the binary operation
- * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
- */
-std::unique_ptr<column> binary_operation(
-  scalar const& lhs,
-  column_view const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Performs a binary operation between a column and a scalar.
- *
- * The output contains the result of `op(lhs[i], rhs)` for all `0 <= i < lhs.size()`
- * The column elements are the left operand and the scalar is the right operand.
- * This distinction is significant in case of non-commutative binary operations
- *
- * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
- *
- * @param lhs         The left operand column
- * @param rhs         The right operand scalar
- * @param op          The binary operator
- * @param output_type The desired data type of the output column
- * @param mr          Device memory resource used to allocate the returned column's device memory
- * @return            Output column of `output_type` type containing the result of
- *                    the binary operation
- * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
- */
-std::unique_ptr<column> binary_operation(
-  column_view const& lhs,
-  scalar const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Performs a binary operation between two columns.
- *
- * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()`
- *
- * Regardless of the operator, the validity of the output value is the logical
- * AND of the validity of the two operands
- *
- * @param lhs         The left operand column
- * @param rhs         The right operand column
- * @param op          The binary operator
- * @param output_type The desired data type of the output column
- * @param mr          Device memory resource used to allocate the returned column's device memory
- * @return            Output column of `output_type` type containing the result of
- *                    the binary operation
- * @throw cudf::logic_error if @p lhs and @p rhs are different sizes
- * @throw cudf::logic_error if @p output_type dtype isn't fixed-width
- */
-std::unique_ptr<column> binary_operation(
-  column_view const& lhs,
-  column_view const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-}  // namespace jit
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp
index ce7731ef7d2..9fa31d0e01d 100644
--- a/cpp/include/cudf/detail/binaryop.hpp
+++ b/cpp/include/cudf/detail/binaryop.hpp
@@ -22,52 +22,9 @@
 namespace cudf {
 //! Inner interfaces and implementations
 namespace detail {
-namespace jit {
-/**
- * @copydoc cudf::jit::binary_operation(scalar const&, column_view const&, binary_operator,
- * data_type, rmm::mr::device_memory_resource *)
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
- */
-std::unique_ptr<column> binary_operation(
-  scalar const& lhs,
-  column_view const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @copydoc cudf::jit::binary_operation(column_view const&, scalar const&, binary_operator,
- * data_type, rmm::mr::device_memory_resource *)
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
- */
-std::unique_ptr<column> binary_operation(
-  column_view const& lhs,
-  scalar const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&,
- * binary_operator, data_type, rmm::mr::device_memory_resource *)
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
- */
-std::unique_ptr<column> binary_operation(
-  column_view const& lhs,
-  column_view const& rhs,
-  binary_operator op,
-  data_type output_type,
-  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-}  // namespace jit
-
-/**
- * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&,
+ * @copydoc cudf::binary_operation(column_view const&, column_view const&,
  * std::string const&, data_type, rmm::mr::device_memory_resource *)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index e84e175eaca..3398592d5b4 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -18,7 +18,6 @@
  */
 
 #include "compiled/binary_ops.hpp"
-#include "jit/util.hpp"
 
 #include <jit_preprocessed_files/binaryop/jit/kernel.cu.jit.hpp>
 
@@ -126,113 +125,6 @@ bool is_same_scale_necessary(binary_operator op)
 }
 
 namespace jit {
-
-void binary_operation(mutable_column_view& out,
-                      column_view const& lhs,
-                      scalar const& rhs,
-                      binary_operator op,
-                      OperatorType op_type,
-                      rmm::cuda_stream_view stream)
-{
-  if (is_null_dependent(op)) {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, op_type));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs),
-               out.null_mask(),
-               lhs.null_mask(),
-               lhs.offset(),
-               rhs.is_valid(stream));
-  } else {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_s")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),            // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, op_type));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs));
-  }
-}
-
-void binary_operation(mutable_column_view& out,
-                      column_view const& lhs,
-                      scalar const& rhs,
-                      binary_operator op,
-                      rmm::cuda_stream_view stream)
-{
-  return binary_operation(out, lhs, rhs, op, OperatorType::Direct, stream);
-}
-
-void binary_operation(mutable_column_view& out,
-                      scalar const& lhs,
-                      column_view const& rhs,
-                      binary_operator op,
-                      rmm::cuda_stream_view stream)
-{
-  return binary_operation(out, rhs, lhs, op, OperatorType::Reverse, stream);
-}
-
-void binary_operation(mutable_column_view& out,
-                      column_view const& lhs,
-                      column_view const& rhs,
-                      binary_operator op,
-                      rmm::cuda_stream_view stream)
-{
-  if (is_null_dependent(op)) {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),  // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, OperatorType::Direct));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs),
-               out.null_mask(),
-               lhs.null_mask(),
-               rhs.offset(),
-               rhs.null_mask(),
-               rhs.offset());
-  } else {
-    std::string kernel_name =
-      jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
-        .instantiate(cudf::jit::get_type_name(out.type()),            // list of template arguments
-                     cudf::jit::get_type_name(lhs.type()),
-                     cudf::jit::get_type_name(rhs.type()),
-                     get_operator_name(op, OperatorType::Direct));
-
-    cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-      .get_kernel(kernel_name, {}, {}, {"-arch=sm_."})       //
-      ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
-      ->launch(out.size(),
-               cudf::jit::get_data_ptr(out),
-               cudf::jit::get_data_ptr(lhs),
-               cudf::jit::get_data_ptr(rhs));
-  }
-}
-
 void binary_operation(mutable_column_view& out,
                       column_view const& lhs,
                       column_view const& rhs,
@@ -246,12 +138,11 @@ void binary_operation(mutable_column_view& out,
   std::string cuda_source =
     cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name);
 
-  std::string kernel_name =
-    jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
-      .instantiate(output_type_name,                                // list of template arguments
-                   cudf::jit::get_type_name(lhs.type()),
-                   cudf::jit::get_type_name(rhs.type()),
-                   get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct));
+  std::string kernel_name = jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
+                              .instantiate(output_type_name,  // list of template arguments
+                                           cudf::jit::get_type_name(lhs.type()),
+                                           cudf::jit::get_type_name(rhs.type()),
+                                           std::string("UserDefinedOp"));
 
   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
     .get_kernel(
@@ -418,126 +309,8 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lh
       output_type, lhs.size(), std::move(new_mask), null_count, stream, mr);
   }
 };
-
-namespace jit {
-
-std::unique_ptr<column> binary_operation(scalar const& lhs,
-                                         column_view const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  // calls compiled ops for string types
-  if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
-
-  // Check for datatype
-  CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
-  CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype");
-  CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype");
-  CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype");
-  CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype");
-
-  auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
-
-  if (rhs.is_empty()) return out;
-
-  auto out_view = out->mutable_view();
-  binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-  return out;
-}
-
-std::unique_ptr<column> binary_operation(column_view const& lhs,
-                                         scalar const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  // calls compiled ops for string types
-  if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
-
-  // Check for datatype
-  CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
-  CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype");
-  CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype");
-  CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype");
-  CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype");
-
-  auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
-
-  if (lhs.is_empty()) return out;
-
-  auto out_view = out->mutable_view();
-  binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-  return out;
-}
-
-std::unique_ptr<column> binary_operation(column_view const& lhs,
-                                         column_view const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
-
-  // calls compiled ops for string types
-  if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
-
-  // Check for datatype
-  CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
-  CUDF_EXPECTS(not is_fixed_point(lhs.type()), "Invalid/Unsupported lhs datatype");
-  CUDF_EXPECTS(not is_fixed_point(rhs.type()), "Invalid/Unsupported rhs datatype");
-  CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype");
-  CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype");
-
-  auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
-
-  if (lhs.is_empty() or rhs.is_empty()) return out;
-
-  auto out_view = out->mutable_view();
-  binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-  return out;
-}
-}  // namespace jit
 }  // namespace detail
 
-namespace jit {
-std::unique_ptr<column> binary_operation(scalar const& lhs,
-                                         column_view const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
-}
-
-std::unique_ptr<column> binary_operation(column_view const& lhs,
-                                         scalar const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
-}
-
-std::unique_ptr<column> binary_operation(column_view const& lhs,
-                                         column_view const& rhs,
-                                         binary_operator op,
-                                         data_type output_type,
-                                         rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
-}
-}  // namespace jit
-
 namespace detail {
 std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          column_view const& rhs,
diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu
index fcfe16f979d..3130cf65bb3 100644
--- a/cpp/src/binaryop/jit/kernel.cu
+++ b/cpp/src/binaryop/jit/kernel.cu
@@ -18,8 +18,6 @@
  * limitations under the License.
  */
 
-#include <binaryop/jit/operation.hpp>
-
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
@@ -30,55 +28,6 @@ namespace cudf {
 namespace binops {
 namespace jit {
 
-template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
-__global__ void kernel_v_s_with_validity(cudf::size_type size,
-                                         TypeOut* out_data,
-                                         TypeLhs* lhs_data,
-                                         TypeRhs* rhs_data,
-                                         cudf::bitmask_type* output_mask,
-                                         cudf::bitmask_type const* mask,
-                                         cudf::size_type offset,
-                                         bool scalar_valid)
-{
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
-
-  int start = tid + blkid * blksz;
-  int step  = blksz * gridsz;
-
-  for (cudf::size_type i = start; i < size; i += step) {
-    bool output_valid = false;
-    out_data[i]       = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(
-      lhs_data[i],
-      rhs_data[0],
-      mask ? cudf::bit_is_set(mask, offset + i) : true,
-      scalar_valid,
-      output_valid);
-    if (output_mask && !output_valid) cudf::clear_bit(output_mask, i);
-  }
-}
-
-template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
-__global__ void kernel_v_s(cudf::size_type size,
-                           TypeOut* out_data,
-                           TypeLhs* lhs_data,
-                           TypeRhs* rhs_data)
-{
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
-
-  int start = tid + blkid * blksz;
-  int step  = blksz * gridsz;
-
-  for (cudf::size_type i = start; i < size; i += step) {
-    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(lhs_data[i], rhs_data[0]);
-  }
-}
-
 template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
 __global__ void kernel_v_v(cudf::size_type size,
                            TypeOut* out_data,
diff --git a/cpp/src/binaryop/jit/operation.hpp b/cpp/src/binaryop/jit/operation.hpp
deleted file mode 100644
index d117f2182f9..00000000000
--- a/cpp/src/binaryop/jit/operation.hpp
+++ /dev/null
@@ -1,646 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/types.hpp>
-
-#include <binaryop/jit/operation-udf.hpp>
-#include <binaryop/jit/traits.hpp>
-
-#include <cmath>
-
-#include <cuda/std/type_traits>
-
-#pragma once
-
-using namespace cuda::std;
-
-namespace cudf {
-namespace binops {
-namespace jit {
-
-struct Add {
-  // Allow sum between chronos only when both input and output types
-  // are chronos. Unsupported combinations will fail to compile
-  template <
-    typename TypeOut,
-    typename TypeLhs,
-    typename TypeRhs,
-    enable_if_t<(is_chrono_v<TypeOut> && is_chrono_v<TypeLhs> && is_chrono_v<TypeRhs>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return x + y;
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(!is_chrono_v<TypeOut> || !is_chrono_v<TypeLhs> ||
-                         !is_chrono_v<TypeRhs>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
-    return static_cast<TypeOut>(static_cast<TypeCommon>(x) + static_cast<TypeCommon>(y));
-  }
-};
-
-using RAdd = Add;
-
-struct Sub {
-  // Allow difference between chronos only when both input and output types
-  // are chronos. Unsupported combinations will fail to compile
-  template <
-    typename TypeOut,
-    typename TypeLhs,
-    typename TypeRhs,
-    enable_if_t<(is_chrono_v<TypeOut> && is_chrono_v<TypeLhs> && is_chrono_v<TypeRhs>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return x - y;
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(!is_chrono_v<TypeOut> || !is_chrono_v<TypeLhs> ||
-                         !is_chrono_v<TypeRhs>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
-    return static_cast<TypeOut>(static_cast<TypeCommon>(x) - static_cast<TypeCommon>(y));
-  }
-};
-
-struct RSub {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return Sub::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct Mul {
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(!is_duration_v<TypeOut>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
-    return static_cast<TypeOut>(static_cast<TypeCommon>(x) * static_cast<TypeCommon>(y));
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_duration_v<TypeOut>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return DurationProduct<TypeOut>(x, y);
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_duration_v<TypeLhs> && is_integral_v<TypeRhs>) ||
-                        (is_integral_v<TypeLhs> && is_duration_v<TypeRhs>)>* = nullptr>
-  static TypeOut DurationProduct(TypeLhs x, TypeRhs y)
-  {
-    return x * y;
-  }
-};
-
-using RMul = Mul;
-
-struct Div {
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(!is_duration_v<TypeLhs>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
-    return static_cast<TypeOut>(static_cast<TypeCommon>(x) / static_cast<TypeCommon>(y));
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_duration_v<TypeLhs>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return DurationDivide<TypeOut>(x, y);
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_integral_v<TypeRhs> || is_duration_v<TypeRhs>)>* = nullptr>
-  static TypeOut DurationDivide(TypeLhs x, TypeRhs y)
-  {
-    return x / y;
-  }
-};
-
-struct RDiv {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return Div::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct TrueDiv {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (static_cast<double>(x) / static_cast<double>(y));
-  }
-};
-
-struct RTrueDiv {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return TrueDiv::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct FloorDiv {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return floor(static_cast<double>(x) / static_cast<double>(y));
-  }
-};
-
-struct RFloorDiv {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return FloorDiv::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct Mod {
-  template <
-    typename TypeOut,
-    typename TypeLhs,
-    typename TypeRhs,
-    enable_if_t<(is_integral_v<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
-    return static_cast<TypeOut>(static_cast<TypeCommon>(x) % static_cast<TypeCommon>(y));
-  }
-
-  template <
-    typename TypeOut,
-    typename TypeLhs,
-    typename TypeRhs,
-    enable_if_t<(isFloat<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return static_cast<TypeOut>(fmodf(static_cast<float>(x), static_cast<float>(y)));
-  }
-
-  template <
-    typename TypeOut,
-    typename TypeLhs,
-    typename TypeRhs,
-    enable_if_t<(isDouble<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return static_cast<TypeOut>(fmod(static_cast<double>(x), static_cast<double>(y)));
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_duration_v<TypeLhs> && is_duration_v<TypeOut>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return x % y;
-  }
-};
-
-struct RMod {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return Mod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct PyMod {
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_integral_v<TypeOut>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return ((x % y) + y) % y;
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_floating_point_v<TypeOut>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    double x1 = static_cast<double>(x);
-    double y1 = static_cast<double>(y);
-    return fmod(fmod(x1, y1) + y1, y1);
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_duration_v<TypeLhs> && is_duration_v<TypeOut>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return ((x % y) + y) % y;
-  }
-};
-
-struct RPyMod {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return PyMod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct Pow {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return pow(static_cast<double>(x), static_cast<double>(y));
-  }
-};
-
-struct RPow {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return Pow::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct Equal {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x == y);
-  }
-};
-
-using REqual = Equal;
-
-struct NotEqual {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x != y);
-  }
-};
-
-using RNotEqual = NotEqual;
-
-struct Less {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x < y);
-  }
-};
-
-struct RLess {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (y < x);
-  }
-};
-
-struct Greater {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x > y);
-  }
-};
-
-struct RGreater {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (y > x);
-  }
-};
-
-struct LessEqual {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x <= y);
-  }
-};
-
-struct RLessEqual {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (y <= x);
-  }
-};
-
-struct GreaterEqual {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x >= y);
-  }
-};
-
-struct RGreaterEqual {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (y >= x);
-  }
-};
-
-struct BitwiseAnd {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (static_cast<TypeOut>(x) & static_cast<TypeOut>(y));
-  }
-};
-
-using RBitwiseAnd = BitwiseAnd;
-
-struct BitwiseOr {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (static_cast<TypeOut>(x) | static_cast<TypeOut>(y));
-  }
-};
-
-using RBitwiseOr = BitwiseOr;
-
-struct BitwiseXor {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (static_cast<TypeOut>(x) ^ static_cast<TypeOut>(y));
-  }
-};
-
-using RBitwiseXor = BitwiseXor;
-
-struct LogicalAnd {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x && y);
-  }
-};
-
-using RLogicalAnd = LogicalAnd;
-
-struct LogicalOr {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x || y);
-  }
-};
-
-using RLogicalOr = LogicalOr;
-
-struct UserDefinedOp {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    TypeOut output;
-    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
-    GENERIC_BINARY_OP(&output, static_cast<TypeCommon>(x), static_cast<TypeCommon>(y));
-    return output;
-  }
-};
-
-struct ShiftLeft {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x << y);
-  }
-};
-
-struct RShiftLeft {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (y << x);
-  }
-};
-
-struct ShiftRight {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (x >> y);
-  }
-};
-
-struct RShiftRight {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (y >> x);
-  }
-};
-
-struct ShiftRightUnsigned {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (static_cast<make_unsigned_t<TypeLhs>>(x) >> y);
-  }
-};
-
-struct RShiftRightUnsigned {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (static_cast<make_unsigned_t<TypeRhs>>(y) >> x);
-  }
-};
-
-struct LogBase {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return (std::log(static_cast<double>(x)) / std::log(static_cast<double>(y)));
-  }
-};
-
-struct RLogBase {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return LogBase::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct NullEquals {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
-  {
-    output_valid = true;
-    if (!lhs_valid && !rhs_valid) return true;
-    if (lhs_valid && rhs_valid) return x == y;
-    return false;
-  }
-};
-
-struct RNullEquals {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
-  {
-    output_valid = true;
-    return NullEquals::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
-  }
-};
-
-struct NullMax {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
-  {
-    output_valid = true;
-    if (!lhs_valid && !rhs_valid) {
-      output_valid = false;
-      return TypeOut{};
-    } else if (lhs_valid && rhs_valid) {
-      return (TypeOut{x} > TypeOut{y}) ? TypeOut{x} : TypeOut{y};
-    } else if (lhs_valid)
-      return TypeOut{x};
-    else
-      return TypeOut{y};
-  }
-};
-
-struct RNullMax {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
-  {
-    return NullMax::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
-  }
-};
-
-struct NullMin {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
-  {
-    output_valid = true;
-    if (!lhs_valid && !rhs_valid) {
-      output_valid = false;
-      return TypeOut{};
-    } else if (lhs_valid && rhs_valid) {
-      return (TypeOut{x} < TypeOut{y}) ? TypeOut{x} : TypeOut{y};
-    } else if (lhs_valid)
-      return TypeOut{x};
-    else
-      return TypeOut{y};
-  }
-};
-
-struct RNullMin {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
-  {
-    return NullMin::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
-  }
-};
-
-struct PMod {
-  // Ideally, these two specializations - one for integral types and one for non integral
-  // types shouldn't be required, as std::fmod should promote integral types automatically
-  // to double and call the std::fmod overload for doubles. Sadly, doing this in jitified
-  // code does not work - it is having trouble deciding between float/double overloads
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<(is_integral_v<typename cuda::std::common_type<TypeLhs, TypeRhs>::type>)>* =
-              nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    using common_t = typename cuda::std::common_type<TypeLhs, TypeRhs>::type;
-    common_t xconv{x};
-    common_t yconv{y};
-    auto rem = xconv % yconv;
-    if (rem < 0) rem = (rem + yconv) % yconv;
-    return TypeOut{rem};
-  }
-
-  template <typename TypeOut,
-            typename TypeLhs,
-            typename TypeRhs,
-            enable_if_t<
-              !(is_integral_v<typename cuda::std::common_type<TypeLhs, TypeRhs>::type>)>* = nullptr>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    using common_t = typename cuda::std::common_type<TypeLhs, TypeRhs>::type;
-    common_t xconv{x};
-    common_t yconv{y};
-    auto rem = std::fmod(xconv, yconv);
-    if (rem < 0) rem = std::fmod(rem + yconv, yconv);
-    return TypeOut{rem};
-  }
-};
-
-struct RPMod {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return PMod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
-  }
-};
-
-struct ATan2 {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return TypeOut{std::atan2(double{x}, double{y})};
-  }
-};
-
-struct RATan2 {
-  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-  static TypeOut operate(TypeLhs x, TypeRhs y)
-  {
-    return TypeOut{ATan2::operate<TypeOut, TypeRhs, TypeLhs>(y, x)};
-  }
-};
-
-}  // namespace jit
-}  // namespace binops
-}  // namespace cudf
diff --git a/cpp/src/binaryop/jit/traits.hpp b/cpp/src/binaryop/jit/traits.hpp
deleted file mode 100644
index 1033d38a668..00000000000
--- a/cpp/src/binaryop/jit/traits.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-// Include Jitify's cstddef header first
-#include <cstddef>
-
-#include <cuda/std/climits>
-#include <cuda/std/cstddef>
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>
-
-#include <cudf/wrappers/durations.hpp>
-#include <cudf/wrappers/timestamps.hpp>
-
-namespace cudf {
-namespace binops {
-namespace jit {
-
-// -------------------------------------------------------------------------
-// type_traits cannot tell the difference between float and double
-template <typename Type>
-constexpr bool isFloat = false;
-
-template <typename T>
-constexpr bool is_timestamp_v =
-  cuda::std::is_same_v<cudf::timestamp_D, T> || cuda::std::is_same_v<cudf::timestamp_s, T> ||
-  cuda::std::is_same_v<cudf::timestamp_ms, T> || cuda::std::is_same_v<cudf::timestamp_us, T> ||
-  cuda::std::is_same_v<cudf::timestamp_ns, T>;
-
-template <typename T>
-constexpr bool is_duration_v =
-  cuda::std::is_same_v<cudf::duration_D, T> || cuda::std::is_same_v<cudf::duration_s, T> ||
-  cuda::std::is_same_v<cudf::duration_ms, T> || cuda::std::is_same_v<cudf::duration_us, T> ||
-  cuda::std::is_same_v<cudf::duration_ns, T>;
-
-template <typename T>
-constexpr bool is_chrono_v = is_timestamp_v<T> || is_duration_v<T>;
-
-template <>
-constexpr bool isFloat<float> = true;
-
-template <typename Type>
-constexpr bool isDouble = false;
-
-template <>
-constexpr bool isDouble<double> = true;
-
-}  // namespace jit
-}  // namespace binops
-}  // namespace cudf
diff --git a/cpp/src/binaryop/jit/util.hpp b/cpp/src/binaryop/jit/util.hpp
deleted file mode 100644
index 34c42e28a8b..00000000000
--- a/cpp/src/binaryop/jit/util.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/binaryop.hpp>
-
-#include <string>
-
-namespace cudf {
-namespace binops {
-namespace jit {
-
-/**
- * @brief Orientation of lhs and rhs in operator
- */
-enum class OperatorType {
-  Direct,  ///< Orientation of operands is op(lhs, rhs)
-  Reverse  ///< Orientation of operands is op(rhs, lhs)
-};
-
-/**
- * @brief Get the Operator Name
- *
- * @param op The binary operator as enum of type cudf::binary_operator
- * @param type @see OperatorType
- * @return std::string The name of the operator as string
- */
-std::string inline get_operator_name(binary_operator op, OperatorType type)
-{
-  std::string const operator_name = [op] {
-    // clang-format off
-    switch (op) {
-      case binary_operator::ADD:                  return "Add";
-      case binary_operator::SUB:                  return "Sub";
-      case binary_operator::MUL:                  return "Mul";
-      case binary_operator::DIV:                  return "Div";
-      case binary_operator::TRUE_DIV:             return "TrueDiv";
-      case binary_operator::FLOOR_DIV:            return "FloorDiv";
-      case binary_operator::MOD:                  return "Mod";
-      case binary_operator::PYMOD:                return "PyMod";
-      case binary_operator::POW:                  return "Pow";
-      case binary_operator::EQUAL:                return "Equal";
-      case binary_operator::NOT_EQUAL:            return "NotEqual";
-      case binary_operator::LESS:                 return "Less";
-      case binary_operator::GREATER:              return "Greater";
-      case binary_operator::LESS_EQUAL:           return "LessEqual";
-      case binary_operator::GREATER_EQUAL:        return "GreaterEqual";
-      case binary_operator::BITWISE_AND:          return "BitwiseAnd";
-      case binary_operator::BITWISE_OR:           return "BitwiseOr";
-      case binary_operator::BITWISE_XOR:          return "BitwiseXor";
-      case binary_operator::LOGICAL_AND:          return "LogicalAnd";
-      case binary_operator::LOGICAL_OR:           return "LogicalOr";
-      case binary_operator::GENERIC_BINARY:       return "UserDefinedOp";
-      case binary_operator::SHIFT_LEFT:           return "ShiftLeft";
-      case binary_operator::SHIFT_RIGHT:          return "ShiftRight";
-      case binary_operator::SHIFT_RIGHT_UNSIGNED: return "ShiftRightUnsigned";
-      case binary_operator::LOG_BASE:             return "LogBase";
-      case binary_operator::ATAN2:                return "ATan2";
-      case binary_operator::PMOD:                 return "PMod";
-      case binary_operator::NULL_EQUALS:          return "NullEquals";
-      case binary_operator::NULL_MAX:             return "NullMax";
-      case binary_operator::NULL_MIN:             return "NullMin";
-      default:                                    return "";
-    }
-    // clang-format on
-  }();
-
-  if (operator_name == "") { return "None"; }
-
-  return "cudf::binops::jit::" +
-         (type == OperatorType::Direct ? operator_name : 'R' + operator_name);
-}
-
-}  // namespace jit
-}  // namespace binops
-}  // namespace cudf

From 1b9d624af8b3b5d097be3e478d5f878fa26e7632 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 25 Nov 2021 23:53:17 +0530
Subject: [PATCH 09/42] remove jit benchmark

---
 cpp/benchmarks/CMakeLists.txt                 |  1 -
 .../binaryop/binaryop_benchmark.cpp           |  8 +-
 .../binaryop/jit_binaryop_benchmark.cpp       | 99 -------------------
 3 files changed, 4 insertions(+), 104 deletions(-)
 delete mode 100644 cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index fa1e61e26fd..72b247ae748 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -208,7 +208,6 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp)
 # * binaryop benchmark ----------------------------------------------------------------------------
 ConfigureBench(
   BINARYOP_BENCH binaryop/binaryop_benchmark.cpp binaryop/compiled_binaryop_benchmark.cpp
-  binaryop/jit_binaryop_benchmark.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp
index 9de1112a9db..314d657679b 100644
--- a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp
+++ b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp
@@ -74,14 +74,14 @@ static void BM_binaryop_transform(benchmark::State& state)
     auto const op         = cudf::binary_operator::ADD;
     auto result_data_type = cudf::data_type(cudf::type_to_id<key_type>());
     if (reuse_columns) {
-      auto result = cudf::jit::binary_operation(columns.at(0), columns.at(0), op, result_data_type);
+      auto result = cudf::binary_operation(columns.at(0), columns.at(0), op, result_data_type);
       for (cudf::size_type i = 0; i < tree_levels - 1; i++) {
-        result = cudf::jit::binary_operation(result->view(), columns.at(0), op, result_data_type);
+        result = cudf::binary_operation(result->view(), columns.at(0), op, result_data_type);
       }
     } else {
-      auto result = cudf::jit::binary_operation(columns.at(0), columns.at(1), op, result_data_type);
+      auto result = cudf::binary_operation(columns.at(0), columns.at(1), op, result_data_type);
       std::for_each(std::next(columns.cbegin(), 2), columns.cend(), [&](auto const& col) {
-        result = cudf::jit::binary_operation(result->view(), col, op, result_data_type);
+        result = cudf::binary_operation(result->view(), col, op, result_data_type);
       });
     }
   }
diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp
deleted file mode 100644
index 7fda4a50ea1..00000000000
--- a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <fixture/benchmark_fixture.hpp>
-#include <synchronization/synchronization.hpp>
-
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/binaryop.hpp>
-
-#include <thrust/iterator/counting_iterator.h>
-
-template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator>
-class JIT_BINARYOP : public cudf::benchmark {
-};
-
-template <typename TypeLhs, typename TypeRhs, typename TypeOut>
-void BM_binaryop(benchmark::State& state, cudf::binary_operator binop)
-{
-  const cudf::size_type column_size{(cudf::size_type)state.range(0)};
-
-  auto data_it = thrust::make_counting_iterator(0);
-  cudf::test::fixed_width_column_wrapper<TypeLhs> input1(data_it, data_it + column_size);
-  cudf::test::fixed_width_column_wrapper<TypeRhs> input2(data_it, data_it + column_size);
-
-  auto lhs          = cudf::column_view(input1);
-  auto rhs          = cudf::column_view(input2);
-  auto output_dtype = cudf::data_type(cudf::type_to_id<TypeOut>());
-
-  // Call once for hot cache.
-  cudf::jit::binary_operation(lhs, rhs, binop, output_dtype);
-
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    cudf::jit::binary_operation(lhs, rhs, binop, output_dtype);
-  }
-}
-
-// TODO tparam boolean for null.
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut)               \
-  BENCHMARK_TEMPLATE_DEFINE_F(                                                    \
-    JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \
-  (::benchmark::State & st)                                                       \
-  {                                                                               \
-    BM_binaryop<TypeLhs, TypeRhs, TypeOut>(st, cudf::binary_operator::binop);     \
-  }                                                                               \
-  BENCHMARK_REGISTER_F(JIT_BINARYOP, binop)                                       \
-    ->Unit(benchmark::kMicrosecond)                                               \
-    ->UseManualTime()                                                             \
-    ->Arg(10000)      /* 10k */                                                   \
-    ->Arg(100000)     /* 100k */                                                  \
-    ->Arg(1000000)    /* 1M */                                                    \
-    ->Arg(10000000)   /* 10M */                                                   \
-    ->Arg(100000000); /* 100M */
-
-using namespace cudf;
-using namespace numeric;
-
-// clang-format off
-BINARYOP_BENCHMARK_DEFINE(float,        int64_t,      ADD,                  int32_t);
-BINARYOP_BENCHMARK_DEFINE(duration_s,   duration_D,   SUB,                  duration_ms);
-BINARYOP_BENCHMARK_DEFINE(float,        float,        MUL,                  int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      DIV,                  int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      TRUE_DIV,             int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      FLOOR_DIV,            int64_t);
-BINARYOP_BENCHMARK_DEFINE(double,       double,       MOD,                  double);
-BINARYOP_BENCHMARK_DEFINE(int32_t,      int64_t,      PMOD,                 double);
-BINARYOP_BENCHMARK_DEFINE(int32_t,      uint8_t,      PYMOD,                int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      POW,                  double);
-BINARYOP_BENCHMARK_DEFINE(float,        double,       LOG_BASE,             double);
-BINARYOP_BENCHMARK_DEFINE(float,        double,       ATAN2,                double);
-BINARYOP_BENCHMARK_DEFINE(int,          int,          SHIFT_LEFT,           int);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int64_t,      SHIFT_RIGHT,          int);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int32_t,      SHIFT_RIGHT_UNSIGNED, int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int32_t,      BITWISE_AND,          int16_t);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int32_t,      BITWISE_OR,           int64_t);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int64_t,      BITWISE_XOR,          int32_t);
-BINARYOP_BENCHMARK_DEFINE(double,       int8_t,       LOGICAL_AND,          bool);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int64_t,      LOGICAL_OR,           bool);
-BINARYOP_BENCHMARK_DEFINE(duration_ms,  duration_ns,  EQUAL,                bool);
-BINARYOP_BENCHMARK_DEFINE(decimal32,    decimal32,    NOT_EQUAL,            bool);
-BINARYOP_BENCHMARK_DEFINE(timestamp_s,  timestamp_s,  LESS,                 bool);
-BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s,  GREATER,              bool);
-BINARYOP_BENCHMARK_DEFINE(duration_ms,  duration_ns,  NULL_EQUALS,          bool);
-BINARYOP_BENCHMARK_DEFINE(decimal32,    decimal32,    NULL_MAX,             decimal32);
-BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s,   NULL_MIN,             timestamp_s);

From e49a3430ec0a75379fcf83a5041f0ae81a1ffdba Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 26 Nov 2021 00:40:40 +0530
Subject: [PATCH 10/42] skip generic op udf (jit ptx) in pytest CUDA<11.5

---
 python/cudf/cudf/tests/test_udf_binops.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py
index 4d6188acf8c..935c3868a68 100644
--- a/python/cudf/cudf/tests/test_udf_binops.py
+++ b/python/cudf/cudf/tests/test_udf_binops.py
@@ -6,11 +6,20 @@
 from numba.cuda import compile_ptx
 from numba.np import numpy_support
 
+import rmm
+
 import cudf
 from cudf import Series, _lib as libcudf
 from cudf.utils import dtypes as dtypeutils
 
+_driver_version = rmm._cuda.gpu.driverGetVersion()
+_runtime_version = rmm._cuda.gpu.runtimeGetVersion()
+_CUDA_JIT128INT_SUPPORTED = (_driver_version >= 11050) and (
+    _runtime_version >= 11050
+)
+
 
+@pytest.mark.skipif(not _CUDA_JIT128INT_SUPPORTED, reason="requires CUDA 11.5")
 @pytest.mark.parametrize(
     "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int8"}))
 )

From 8f640866b7512409af5e12b38bd7e8b3633b9a9e Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 26 Nov 2021 02:07:56 +0530
Subject: [PATCH 11/42] add deleted UserDefinedOp

---
 cpp/src/binaryop/binaryop.cpp  |  2 +-
 cpp/src/binaryop/jit/kernel.cu | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 3398592d5b4..a89e9795283 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -142,7 +142,7 @@ void binary_operation(mutable_column_view& out,
                               .instantiate(output_type_name,  // list of template arguments
                                            cudf::jit::get_type_name(lhs.type()),
                                            cudf::jit::get_type_name(rhs.type()),
-                                           std::string("UserDefinedOp"));
+                                           std::string("cudf::binops::jit::UserDefinedOp"));
 
   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
     .get_kernel(
diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu
index 3130cf65bb3..4eb011a1846 100644
--- a/cpp/src/binaryop/jit/kernel.cu
+++ b/cpp/src/binaryop/jit/kernel.cu
@@ -28,6 +28,17 @@ namespace cudf {
 namespace binops {
 namespace jit {
 
+struct UserDefinedOp {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    TypeOut output;
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    GENERIC_BINARY_OP(&output, static_cast<TypeCommon>(x), static_cast<TypeCommon>(y));
+    return output;
+  }
+};
+
 template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
 __global__ void kernel_v_v(cudf::size_type size,
                            TypeOut* out_data,

From efb203bcd847d786faacc25da754251bf4c5eb2b Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 26 Nov 2021 03:51:38 +0530
Subject: [PATCH 12/42] fix missing includes

---
 cpp/src/binaryop/jit/kernel.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu
index 4eb011a1846..c9cc61a4f34 100644
--- a/cpp/src/binaryop/jit/kernel.cu
+++ b/cpp/src/binaryop/jit/kernel.cu
@@ -24,6 +24,9 @@
 #include <cudf/wrappers/durations.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
+#include <binaryop/jit/operation-udf.hpp>
+#include <cuda/std/type_traits>
+
 namespace cudf {
 namespace binops {
 namespace jit {
@@ -33,7 +36,7 @@ struct UserDefinedOp {
   static TypeOut operate(TypeLhs x, TypeRhs y)
   {
     TypeOut output;
-    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    using TypeCommon = typename cuda::std::common_type<TypeOut, TypeLhs, TypeRhs>::type;
     GENERIC_BINARY_OP(&output, static_cast<TypeCommon>(x), static_cast<TypeCommon>(y));
     return output;
   }

From 011fb482fdba796bc4238add7f86d81829100bff Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 29 Nov 2021 14:06:28 +0530
Subject: [PATCH 13/42] fix segfault by nullptr check in cufile_shim dtor

---
 cpp/src/io/utilities/file_io_utilities.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 387452e171a..f6175fae4ec 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -106,8 +106,8 @@ class cufile_shim {
 
   ~cufile_shim()
   {
-    driver_close();
-    dlclose(cf_lib);
+    if (driver_close) driver_close();
+    if (cf_lib) dlclose(cf_lib);
   }
 
   decltype(cuFileHandleRegister)* handle_register     = nullptr;

From 9bdc28b9964b4bd75e3d387829747315bed7ac45 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 29 Nov 2021 14:13:43 +0530
Subject: [PATCH 14/42] enable cuio tests again

---
 cpp/tests/CMakeLists.txt | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 8ae31d7d74d..c1c209b2413 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -191,11 +191,15 @@ ConfigureTest(
 # * io tests --------------------------------------------------------------------------------------
 ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp)
 
-# ConfigureTest(CSV_TEST io/csv_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp)
-# ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp)
-# ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST
-# io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3)
-# target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif()
+ConfigureTest(CSV_TEST io/csv_test.cpp)
+ConfigureTest(ORC_TEST io/orc_test.cpp)
+ConfigureTest(PARQUET_TEST io/parquet_test.cpp)
+ConfigureTest(JSON_TEST io/json_test.cpp)
+ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
+ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
+if(CUDF_ENABLE_ARROW_S3)
+  target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED")
+endif()
 
 # ##################################################################################################
 # * sort tests ------------------------------------------------------------------------------------

From a3ba687b2e53a4fcc2ed9d9502733251ec12096b Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 29 Nov 2021 21:31:55 +0530
Subject: [PATCH 15/42] address review comments

---
 cpp/src/binaryop/binaryop.cpp             | 10 +++-------
 cpp/tests/binaryop/util/runtime_support.h |  5 +----
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index a89e9795283..7087b71a84e 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -138,16 +138,15 @@ void binary_operation(mutable_column_view& out,
   std::string cuda_source =
     cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name);
 
-  std::string kernel_name = jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")  //
+  std::string kernel_name = jitify2::reflection::Template("cudf::binops::jit::kernel_v_v")
                               .instantiate(output_type_name,  // list of template arguments
                                            cudf::jit::get_type_name(lhs.type()),
                                            cudf::jit::get_type_name(rhs.type()),
                                            std::string("cudf::binops::jit::UserDefinedOp"));
 
   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
-    .get_kernel(
-      kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                  //
+    .get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())
     ->launch(out.size(),
              cudf::jit::get_data_ptr(out),
              cudf::jit::get_data_ptr(lhs),
@@ -221,7 +220,6 @@ namespace detail {
 
 // There are 3 overloads of each of the following functions:
 // - `make_fixed_width_column_for_output`
-// - `fixed_point_binary_operation`
 // - `binary_operation`
 
 // The overloads are overloaded on the first two parameters of each function:
@@ -309,9 +307,7 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lh
       output_type, lhs.size(), std::move(new_mask), null_count, stream, mr);
   }
 };
-}  // namespace detail
 
-namespace detail {
 std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          column_view const& rhs,
                                          binary_operator op,
diff --git a/cpp/tests/binaryop/util/runtime_support.h b/cpp/tests/binaryop/util/runtime_support.h
index a7ee0c3a391..250d34a0879 100644
--- a/cpp/tests/binaryop/util/runtime_support.h
+++ b/cpp/tests/binaryop/util/runtime_support.h
@@ -1,8 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 0fa0cc48a6b3b93e79f918d419a012b75765561c Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Tue, 30 Nov 2021 07:50:12 -0700
Subject: [PATCH 16/42] Support `min` and `max` in inclusive scan for structs
 (#9725)

This PR continues to address https://github.com/rapidsai/cudf/issues/8974, adding support for structs in `min` and `max` inclusive scan. Exclusive scan support is not needed in the near future.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/9725
---
 cpp/include/cudf/detail/scan.hpp          |  71 ++++----
 cpp/src/groupby/sort/group_scan_util.cuh  |   6 +
 cpp/src/reductions/scan/scan_inclusive.cu |  87 +++++++++-
 cpp/tests/reductions/reduction_tests.cpp  |   2 +-
 cpp/tests/reductions/scan_tests.cpp       | 196 ++++++++++++++++++++++
 5 files changed, 325 insertions(+), 37 deletions(-)

diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp
index 113c15f19a1..8e3db1c7b10 100644
--- a/cpp/include/cudf/detail/scan.hpp
+++ b/cpp/include/cudf/detail/scan.hpp
@@ -26,22 +26,25 @@ namespace detail {
 /**
  * @brief Computes the exclusive scan of a column.
  *
- * The null values are skipped for the operation, and if an input element
- * at `i` is null, then the output element at `i` will also be null.
+ * The null values are skipped for the operation, and if an input element at `i` is null, then the
+ * output element at `i` will also be null.
  *
- * The identity value for the column type as per the aggregation type
- * is used for the value of the first element in the output column.
+ * The identity value for the column type as per the aggregation type is used for the value of the
+ * first element in the output column.
  *
- * @throws cudf::logic_error if column data_type is not an arithmetic type.
+ * Struct columns are allowed with aggregation types Min and Max.
  *
- * @param input The input column view for the scan
- * @param agg unique_ptr to aggregation operator applied by the scan
- * @param null_handling Exclude null values when computing the result if
- *                      null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE.
- *                      Any operation with a null results in a null.
+ * @throws cudf::logic_error if column data_type is not an arithmetic type or struct type but the
+ *                           `agg` is not Min or Max.
+ *
+ * @param input The input column view for the scan.
+ * @param agg unique_ptr to aggregation operator applied by the scan.
+ * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE.
+ *                      Include nulls if null_policy::INCLUDE. Any operation with a null results in
+ *                      a null.
  * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @returns Column with scan results
+ * @param mr Device memory resource used to allocate the returned scalar's device memory.
+ * @returns Column with scan results.
  */
 std::unique_ptr<column> scan_exclusive(column_view const& input,
                                        std::unique_ptr<aggregation> const& agg,
@@ -52,22 +55,22 @@ std::unique_ptr<column> scan_exclusive(column_view const& input,
 /**
  * @brief Computes the inclusive scan of a column.
  *
- * The null values are skipped for the operation, and if an input element
- * at `i` is null, then the output element at `i` will also be null.
+ * The null values are skipped for the operation, and if an input element at `i` is null, then the
+ * output element at `i` will also be null.
  *
- * String columns are allowed with aggregation types Min and Max.
+ * String and struct columns are allowed with aggregation types Min and Max.
  *
- * @throws cudf::logic_error if column data_type is not an arithmetic type
- *                           or string type but the `agg` is not Min or Max
+ * @throws cudf::logic_error if column data_type is not an arithmetic type or string/struct types
+ *                           but the `agg` is not Min or Max.
  *
- * @param input The input column view for the scan
- * @param agg unique_ptr to aggregation operator applied by the scan
- * @param null_handling Exclude null values when computing the result if
- *                      null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE.
- *                      Any operation with a null results in a null.
+ * @param input The input column view for the scan.
+ * @param agg unique_ptr to aggregation operator applied by the scan.
+ * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE.
+ *                      Include nulls if null_policy::INCLUDE. Any operation with a null results in
+ *                      a null.
  * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned scalar's device memory
- * @returns Column with scan results
+ * @param mr Device memory resource used to allocate the returned scalar's device memory.
+ * @returns Column with scan results.
  */
 std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        std::unique_ptr<aggregation> const& agg,
@@ -76,24 +79,24 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        rmm::mr::device_memory_resource* mr);
 
 /**
- * @brief Generate row ranks for a column
+ * @brief Generate row ranks for a column.
  *
- * @param order_by Input column to generate ranks for
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return rank values
+ * @param order_by Input column to generate ranks for.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return rank values.
  */
 std::unique_ptr<column> inclusive_rank_scan(column_view const& order_by,
                                             rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr);
 
 /**
- * @brief Generate row dense ranks for a column
+ * @brief Generate row dense ranks for a column.
  *
- * @param order_by Input column to generate ranks for
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return rank values
+ * @param order_by Input column to generate ranks for.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return rank values.
  */
 std::unique_ptr<column> inclusive_dense_rank_scan(column_view const& order_by,
                                                   rmm::cuda_stream_view stream,
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index b565e8dc6d8..ae3e3232e06 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -239,7 +239,13 @@ struct group_scan_functor<K,
     auto gather_map_view =
       column_view(data_type{type_to_id<offset_type>()}, gather_map.size(), gather_map.data());
 
+    //
     // Gather the children elements of the prefix min/max struct elements first.
+    //
+    // Typically, we should use `get_sliced_child` for each child column to properly handle the
+    // input if it is a sliced view. However, since the input to this function is generated by
+    // groupby internal APIs and is never a sliced view, we just use the `child_begin` and
+    // `child_end` iterators for simplicity.
     auto scanned_children =
       cudf::detail::gather(
         table_view(std::vector<column_view>{values.child_begin(), values.child_end()}),
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 02ecd6df4d9..70f5ca90539 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -14,13 +14,17 @@
  * limitations under the License.
  */
 
-#include "scan.cuh"
+#include <reductions/arg_minmax_util.cuh>
+#include <reductions/scan/scan.cuh>
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/structs/utilities.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/reduction.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -150,6 +154,72 @@ struct scan_functor<Op, cudf::string_view> {
   }
 };
 
+template <typename Op>
+struct scan_functor<Op, cudf::struct_view> {
+  static std::unique_ptr<column> invoke(column_view const& input,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+  {
+    // Op is used only to determine whether we want to find the min or max element.
+    auto constexpr is_min_op = std::is_same_v<Op, DeviceMin>;
+
+    // Build indices of the scan operation results (ARGMIN/ARGMAX).
+    // When finding ARGMIN, we need to consider nulls as larger than non-null elements, and the
+    // opposite for ARGMAX.
+    auto gather_map    = rmm::device_uvector<size_type>(input.size(), stream);
+    auto const do_scan = [&](auto const& binop) {
+      thrust::inclusive_scan(rmm::exec_policy(stream),
+                             thrust::counting_iterator<size_type>(0),
+                             thrust::counting_iterator<size_type>(input.size()),
+                             gather_map.begin(),
+                             binop);
+    };
+
+    auto constexpr null_precedence = is_min_op ? cudf::null_order::AFTER : cudf::null_order::BEFORE;
+    auto const flattened_input     = cudf::structs::detail::flatten_nested_columns(
+      table_view{{input}}, {}, std::vector<null_order>{null_precedence});
+    auto const d_flattened_input_ptr = table_device_view::create(flattened_input, stream);
+    auto const flattened_null_precedences =
+      is_min_op ? cudf::detail::make_device_uvector_async(flattened_input.null_orders(), stream)
+                : rmm::device_uvector<cudf::null_order>(0, stream);
+
+    if (input.has_nulls()) {
+      auto const binop = cudf::reduction::detail::row_arg_minmax_fn<true>(
+        input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op);
+      do_scan(binop);
+    } else {
+      auto const binop = cudf::reduction::detail::row_arg_minmax_fn<false>(
+        input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op);
+      do_scan(binop);
+    }
+
+    // Gather the children columns of the input column. Must use `get_sliced_child` to properly
+    // handle input in case it is a sliced view.
+    auto const input_children = [&] {
+      auto const it = cudf::detail::make_counting_transform_iterator(
+        0, [structs_view = structs_column_view{input}, stream](auto const child_idx) {
+          return structs_view.get_sliced_child(child_idx);
+        });
+      return std::vector<column_view>(it, it + input.num_children());
+    }();
+
+    // Gather the children elements of the prefix min/max struct elements for the output.
+    auto scanned_children = cudf::detail::gather(table_view{input_children},
+                                                 gather_map,
+                                                 out_of_bounds_policy::DONT_CHECK,
+                                                 negative_index_policy::NOT_ALLOWED,
+                                                 stream,
+                                                 mr)
+                              ->release();
+
+    // Don't need to set a null mask because that will be handled at the caller.
+    return make_structs_column(input.size(),
+                               std::move(scanned_children),
+                               UNKNOWN_NULL_COUNT,
+                               rmm::device_buffer{0, stream, mr});
+  }
+};
+
 /**
  * @brief Dispatcher for running a Scan operation on an input column
  *
@@ -161,7 +231,11 @@ struct scan_dispatcher {
   template <typename T>
   static constexpr bool is_supported()
   {
-    return std::is_invocable_v<Op, T, T> && !cudf::is_dictionary<T>();
+    if constexpr (std::is_same_v<T, cudf::struct_view>) {
+      return std::is_same_v<Op, DeviceMin> || std::is_same_v<Op, DeviceMax>;
+    } else {
+      return std::is_invocable_v<Op, T, T> && !cudf::is_dictionary<T>();
+    }
   }
 
  public:
@@ -209,6 +283,15 @@ std::unique_ptr<column> scan_inclusive(
     output->set_null_mask(mask_scan(input, scan_type::INCLUSIVE, stream, mr), UNKNOWN_NULL_COUNT);
   }
 
+  // If the input is a structs column, we also need to push down nulls from the parent output column
+  // into the children columns.
+  if (input.type().id() == type_id::STRUCT && output->has_nulls()) {
+    for (size_type idx = 0; idx < output->num_children(); ++idx) {
+      structs::detail::superimpose_parent_nulls(
+        output->view().null_mask(), output->null_count(), output->child(idx), stream, mr);
+    }
+  }
+
   return output;
 }
 }  // namespace detail
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 2c9279260e7..d8ee8f9d08d 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -17,6 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
@@ -28,7 +29,6 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/types.hpp>
 #include <cudf/wrappers/timestamps.hpp>
-#include <cudf_test/table_utilities.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index d1e983460d5..0892436eb47 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/iterator.cuh>
@@ -397,3 +398,198 @@ TYPED_TEST(ScanDurationTest, Sum)
   EXPECT_THROW(cudf::scan(col, cudf::make_sum_aggregation(), cudf::scan_type::EXCLUSIVE),
                cudf::logic_error);
 }
+
+struct StructScanTest : public cudf::test::BaseFixture {
+};
+
+TEST_F(StructScanTest, StructScanMinMaxNoNull)
+{
+  using INTS_CW    = cudf::test::fixed_width_column_wrapper<int32_t>;
+  using STRINGS_CW = cudf::test::strings_column_wrapper;
+  using STRUCTS_CW = cudf::test::structs_column_wrapper;
+
+  auto const input = [] {
+    auto child1 = STRINGS_CW{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"};
+    auto child2 = INTS_CW{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    return STRUCTS_CW{{child1, child2}};
+  }();
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"};
+      auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8};
+      return STRUCTS_CW{{child1, child2}};
+    }();
+    auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{"año", "bit", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"};
+      auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3};
+      return STRUCTS_CW{{child1, child2}};
+    }();
+    auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(StructScanTest, StructScanMinMaxSlicedInput)
+{
+  using INTS_CW    = cudf::test::fixed_width_column_wrapper<int>;
+  using STRINGS_CW = cudf::test::strings_column_wrapper;
+  using STRUCTS_CW = cudf::test::structs_column_wrapper;
+  constexpr int32_t dont_care{1};
+
+  auto const input_original = [] {
+    auto child1 = STRINGS_CW{"$dont_care",
+                             "$dont_care",
+                             "año",
+                             "bit",
+                             "₹1",
+                             "aaa",
+                             "zit",
+                             "bat",
+                             "aab",
+                             "$1",
+                             "€1",
+                             "wut",
+                             "₹dont_care"};
+    auto child2 = INTS_CW{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care};
+    return STRUCTS_CW{{child1, child2}};
+  }();
+
+  auto const input = cudf::slice(input_original, {2, 12})[0];
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"};
+      auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8};
+      return STRUCTS_CW{{child1, child2}};
+    }();
+    auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{"año", "bit", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"};
+      auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3};
+      return STRUCTS_CW{{child1, child2}};
+    }();
+    auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(StructScanTest, StructScanMinMaxWithNulls)
+{
+  using INTS_CW    = cudf::test::fixed_width_column_wrapper<int>;
+  using STRINGS_CW = cudf::test::strings_column_wrapper;
+  using STRUCTS_CW = cudf::test::structs_column_wrapper;
+  using cudf::test::iterators::nulls_at;
+
+  auto const input = [] {
+    auto child1 = STRINGS_CW{{"año",
+                              "bit",
+                              "₹1" /*NULL*/,
+                              "aaa" /*NULL*/,
+                              "zit",
+                              "bat",
+                              "aab",
+                              "$1" /*NULL*/,
+                              "€1" /*NULL*/,
+                              "wut"},
+                             nulls_at({2, 7})};
+    auto child2 = INTS_CW{{1, 2, 3 /*NULL*/, 4 /*NULL*/, 5, 6, 7, 8 /*NULL*/, 9 /*NULL*/, 10},
+                          nulls_at({2, 7})};
+    return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})};
+  }();
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{
+        "año", "año", "año", "" /*NULL*/, "año", "año", "aab", "aab", "" /*NULL*/, "aab"};
+      auto child2 = INTS_CW{1, 1, 1, 0 /*NULL*/, 1, 1, 7, 7, 0 /*NULL*/, 7};
+      return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})};
+    }();
+
+    auto const result = cudf::scan(
+      input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{
+        "año", "bit", "bit", "" /*NULL*/, "zit", "zit", "zit", "zit", "" /*NULL*/, "zit"};
+      auto child2 = INTS_CW{1, 2, 2, 0 /*NULL*/, 5, 5, 5, 5, 0 /*NULL*/, 5};
+      return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})};
+    }();
+
+    auto const result = cudf::scan(
+      input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{"año",
+                               "año",
+                               "año",
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/};
+      auto child2 = INTS_CW{1,
+                            1,
+                            1,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/};
+      return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})};
+    }();
+
+    auto const result = cudf::scan(
+      input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto const expected = [] {
+      auto child1 = STRINGS_CW{"año",
+                               "bit",
+                               "bit",
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/,
+                               "" /*NULL*/};
+      auto child2 = INTS_CW{1,
+                            2,
+                            2,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/,
+                            0 /*NULL*/};
+      return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})};
+    }();
+
+    auto const result = cudf::scan(
+      input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}

From 74ac6ed5e06be9a1ee37f3ceaa1d45b2224266f2 Mon Sep 17 00:00:00 2001
From: Alfred Xu <lovedreamf@gmail.com>
Date: Tue, 30 Nov 2021 22:58:19 +0800
Subject: [PATCH 17/42] fix make_empty_scalar_like (#9782)

Signed-off-by: sperlingxx <lovedreamf@gmail.com>
---
 cpp/src/io/orc/writer_impl.cu | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 2bf020d08a2..9e493c192e4 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -1496,15 +1496,23 @@ orc_table_view make_orc_table_view(table_view const& table,
           append_orc_column(col.child(lists_column_view::child_column_index),
                             &orc_columns[new_col_idx],
                             col_meta.child(lists_column_view::child_column_index));
-        } else if (kind == TypeKind::STRUCT or kind == TypeKind::MAP) {
-          // MAP: skip to the list child - include grandchildren columns instead of children
-          auto const real_parent_col =
-            kind == TypeKind::MAP ? col.child(lists_column_view::child_column_index) : col;
-          for (auto child_idx = 0; child_idx != real_parent_col.num_children(); ++child_idx) {
-            append_orc_column(real_parent_col.child(child_idx),
-                              &orc_columns[new_col_idx],
-                              col_meta.child(child_idx));
+        } else if (kind == TypeKind::STRUCT) {
+          for (auto child_idx = 0; child_idx != col.num_children(); ++child_idx) {
+            append_orc_column(
+              col.child(child_idx), &orc_columns[new_col_idx], col_meta.child(child_idx));
           }
+        } else if (kind == TypeKind::MAP) {
+          // MAP: skip to the list child - include grandchildren columns instead of children
+          auto const real_parent_col   = col.child(lists_column_view::child_column_index);
+          auto const& real_parent_meta = col_meta.child(lists_column_view::child_column_index);
+          CUDF_EXPECTS(real_parent_meta.num_children() == 2,
+                       "Map struct column should have exactly two children");
+          // process MAP key
+          append_orc_column(
+            real_parent_col.child(0), &orc_columns[new_col_idx], real_parent_meta.child(0));
+          // process MAP value
+          append_orc_column(
+            real_parent_col.child(1), &orc_columns[new_col_idx], real_parent_meta.child(1));
         }
       };
 

From dca8a0a0356e90e2b9dfa2a2cedf38d0c90935cb Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 30 Nov 2021 10:40:18 -0600
Subject: [PATCH 18/42] Fix dtype-argument bug in dask_cudf read_csv (#9796)

Closes #9719

`dask_cudf.read_csv` currently fails when both `usecols` and `dtype` are specified. This PR is a simple fix. In the near future, the `_internal_read_csv` implementation should also be modified to produce a `Blockwise` HLG Layer, but I will leave that for a separate PR.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/9796
---
 python/dask_cudf/dask_cudf/io/csv.py          | 19 +++++++++++--------
 .../dask_cudf/dask_cudf/io/tests/test_csv.py  |  5 +++--
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py
index 132201a349e..ebb02e3b6d4 100644
--- a/python/dask_cudf/dask_cudf/io/csv.py
+++ b/python/dask_cudf/dask_cudf/io/csv.py
@@ -110,9 +110,17 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
     if chunksize is None:
         return read_csv_without_chunksize(path, **kwargs)
 
+    # Let dask.dataframe generate meta
     dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
-    usecols = kwargs.pop("usecols", None)
-    meta = dask_reader(filenames[0], **kwargs)._meta
+    kwargs1 = kwargs.copy()
+    usecols = kwargs1.pop("usecols", None)
+    dtype = kwargs1.pop("dtype", None)
+    meta = dask_reader(filenames[0], **kwargs1)._meta
+    names = meta.columns
+    if usecols or dtype:
+        # Regenerate meta with original kwargs if
+        # `usecols` or `dtype` was specified
+        meta = dask_reader(filenames[0], **kwargs)._meta
 
     dsk = {}
     i = 0
@@ -127,18 +135,13 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
                 chunksize,
             )  # specify which chunk of the file we care about
             if start != 0:
-                kwargs2[
-                    "names"
-                ] = meta.columns  # no header in the middle of the file
+                kwargs2["names"] = names  # no header in the middle of the file
                 kwargs2["header"] = None
-            kwargs2["usecols"] = usecols
             dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)
 
             i += 1
 
     divisions = [None] * (len(dsk) + 1)
-    if usecols is not None:
-        meta = meta[usecols]
     return dd.core.new_dd_object(dsk, name, meta, divisions)
 
 
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
index 98061f6c624..32960a90bd7 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
@@ -136,7 +136,8 @@ def test_read_csv_chunksize_none(tmp_path, compression, size):
     dd.assert_eq(df, df2)
 
 
-def test_csv_reader_usecols(tmp_path):
+@pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None])
+def test_csv_reader_usecols(tmp_path, dtype):
     df = cudf.DataFrame(
         {
             "a": [1, 2, 3, 4] * 100,
@@ -147,6 +148,6 @@ def test_csv_reader_usecols(tmp_path):
     csv_path = str(tmp_path / "usecols_data.csv")
     df.to_csv(csv_path, index=False)
     ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5)
-    ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"])
+    ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"], dtype=dtype)
 
     dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False)

From 1db05c9d889d04df113986eeee0356778ce8b003 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 30 Nov 2021 11:45:54 -0600
Subject: [PATCH 19/42] Use Java classloader to find test resources (#9760)

Updates the Java tests to use the classloader to locate test files rather than reaching directly into the source directory.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Gera Shegalov (https://github.com/gerashegalov)

URL: https://github.com/rapidsai/cudf/pull/9760
---
 .../src/test/java/ai/rapids/cudf/TableTest.java | 14 +++++++-------
 .../src/test/java/ai/rapids/cudf/TestUtils.java | 17 ++++++++++++++++-
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 4512a08430c..b4247e9bb7c 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -70,11 +70,11 @@
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 public class TableTest extends CudfTestBase {
-  private static final File TEST_PARQUET_FILE = new File("src/test/resources/acq.parquet");
-  private static final File TEST_ORC_FILE = new File("src/test/resources/TestOrcFile.orc");
-  private static final File TEST_ORC_TIMESTAMP_DATE_FILE = new File(
-      "src/test/resources/timestamp-date-test.orc");
-  private static final File TEST_DECIMAL_PARQUET_FILE = new File("src/test/resources/decimal.parquet");
+  private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet");
+  private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc");
+  private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc");
+  private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet");
+  private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv");
 
   private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder()
       .column(DType.INT32, "A")
@@ -548,7 +548,7 @@ void testReadCSVPrune() {
         .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
         .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.2, 119.8)
         .build();
-         Table table = Table.readCSV(schema, opts, new File("./src/test/resources/simple.csv"))) {
+         Table table = Table.readCSV(schema, opts, TEST_SIMPLE_CSV_FILE)) {
       assertTablesAreEqual(expected, table);
     }
   }
@@ -675,7 +675,7 @@ void testReadCSV() {
         .column(120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L)
         .column("one", "two", "three", "four", "five", "six", "seven\ud801\uddb8", "eight\uBF68", "nine\u03E8", "ten")
         .build();
-         Table table = Table.readCSV(schema, new File("./src/test/resources/simple.csv"))) {
+         Table table = Table.readCSV(schema, TEST_SIMPLE_CSV_FILE)) {
       assertTablesAreEqual(expected, table);
     }
   }
diff --git a/java/src/test/java/ai/rapids/cudf/TestUtils.java b/java/src/test/java/ai/rapids/cudf/TestUtils.java
index 5a799c666c2..a1acab5883b 100644
--- a/java/src/test/java/ai/rapids/cudf/TestUtils.java
+++ b/java/src/test/java/ai/rapids/cudf/TestUtils.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2020, NVIDIA CORPORATION.
+ *  Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -18,6 +18,9 @@
 
 package ai.rapids.cudf;
 
+import java.io.File;
+import java.net.URISyntaxException;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
@@ -211,4 +214,16 @@ static Double[] getDoubles(final long seed, final int size, int specialValues) {
     });
     return result;
   }
+
+  public static File getResourceAsFile(String resourceName) {
+    URL url = TestUtils.class.getClassLoader().getResource(resourceName);
+    if (url == null) {
+      throw new IllegalArgumentException("Unable to locate resource: " + resourceName);
+    }
+    try {
+      return new File(url.toURI());
+    } catch (URISyntaxException e) {
+      throw new RuntimeException(e);
+    }
+  }
 }

From 1697f63b9e6e80695cb157f479fada72d053fa1a Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Tue, 30 Nov 2021 23:39:13 +0530
Subject: [PATCH 20/42] Run compute-sanitizer in nightly build (#9641)

Addresses part of https://github.com/rapidsai/cudf/issues/904
- This PR enables running `compute-sanitizer --tool memcheck` on the libcudf unit tests when the environment variable `COMPUTE_SANITIZER_ENABLE=true` is set.
  `COMPUTE_SANITIZER_ENABLE` will be set only in nightly builds of cudf. (To be enabled in PR https://github.com/rapidsai/gpuci-scripts/pull/675)
- This PR also adds a script to parse the compute-sanitizer log into a JUnit XML file that can be processed by Jenkins.
  Only failures are reported. If there are no errors, no tests are reported under the memcheck results.

Note: Only `memcheck` is enabled for now. Other compute-sanitizer checks can be enabled later when required.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/9641
---
 ci/gpu/build.sh | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 664e774c68a..8f83c169330 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 ##############################################
 # cuDF GPU build and test script for CI      #
 ##############################################
@@ -176,6 +176,28 @@ else
         ${gt} --gtest_output=xml:"$WORKSPACE/test-results/"
     done
 
+    ################################################################################
+    # MEMCHECK - Run compute-sanitizer on GoogleTest (only in nightly builds)
+    ################################################################################
+    if [[ "$BUILD_MODE" == "branch" && "$BUILD_TYPE" == "gpu" ]]; then
+        if [[ "$COMPUTE_SANITIZER_ENABLE" == "true" ]]; then
+            gpuci_logger "Memcheck on GoogleTests with rmm_mode=cuda"
+            export GTEST_CUDF_RMM_MODE=cuda
+            COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck"  # expanded unquoted below so it word-splits
+            mkdir -p "$WORKSPACE/test-results/"
+            for gt in gtests/*; do
+                test_name=$(basename "${gt}")
+                if [[ "$test_name" == "ERROR_TEST" ]]; then
+                    continue
+                fi
+                echo "Running GoogleTest $test_name"
+                ${COMPUTE_SANITIZER_CMD} "${gt}" | tee "$WORKSPACE/test-results/${test_name}.cs.log"
+            done
+            unset GTEST_CUDF_RMM_MODE
+            # one test-results/<test_name>.cs.log per test binary; these are processed in gpuci
+        fi
+    fi
+
     CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"`
     CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension
     CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install

From 69d576543b5414372f36d02a189a7217d3bb8006 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 30 Nov 2021 14:40:34 -0500
Subject: [PATCH 21/42] Update check for inf/nan strings in libcudf float
 conversion to ignore case (#9694)

Reference https://github.com/rapidsai/cudf/pull/9613/files#r743579126

Add support to ignore case for strings `INF`, `INFINITY` and `NAN` to `cudf::strings::is_float` and `cudf::strings::to_float` for consistency with https://en.cppreference.com/w/cpp/string/basic_string/stof

Also, remove the expensive `replace` call in cudf that ran before calling this from Python.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Robert Maynard (https://github.com/robertmaynard)
  - Nghia Truong (https://github.com/ttnghia)
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/9694
---
 cpp/include/cudf/strings/string.cuh           | 64 +++++++++++++---
 cpp/src/strings/convert/convert_floats.cu     | 13 ++--
 cpp/tests/strings/floats_tests.cpp            | 51 ++++---------
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 23 +++---
 python/cudf/cudf/core/column/string.py        | 73 -------------------
 5 files changed, 85 insertions(+), 139 deletions(-)

diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh
index 82da5ad8f10..d85d19d7f10 100644
--- a/cpp/include/cudf/strings/string.cuh
+++ b/cpp/include/cudf/strings/string.cuh
@@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str)
            thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; });
 }
 
+/**
+ * @brief Returns true if input contains the not-a-number string.
+ *
+ * The comparison ignores case, so "NAN", "NaN", and "nan" are all valid.
+ * @param d_str input string
+ * @return true if input is a valid NaN string.
+ */
+inline __device__ bool is_nan_str(string_view const& d_str)
+{
+  auto const ptr = d_str.data();
+  return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') &&
+         (ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n');
+}
+
+/**
+ * @brief Returns true if input contains the infinity string.
+ *
+ * The comparison ignores case, so "INF", "Inf", "INFINITY", and "infinity" are all valid.
+ * @param d_str input string
+ * @return true if input is a valid Inf string.
+ */
+inline __device__ bool is_inf_str(string_view const& d_str)
+{
+  auto const ptr  = d_str.data();
+  auto const size = d_str.size_bytes();
+
+  if (size != 3 && size != 8) return false;
+
+  auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') &&
+                            (ptr[2] == 'F' || ptr[2] == 'f');
+
+  return prefix_valid &&
+         ((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') &&
+                          (ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') &&
+                          (ptr[7] == 'Y' || ptr[7] == 'y')));
+}
+
 /**
  * @brief Returns `true` if all characters in the string
  * are valid for conversion to a float type.
@@ -65,8 +102,8 @@ inline __device__ bool is_integer(string_view const& d_str)
  * An empty string returns `false`.
  * No bounds checking is performed to verify if the value would fit
  * within a specific float type.
- * The following strings are also allowed "NaN", "Inf" and, "-Inf"
- * and will return true.
+ * The following strings are also allowed and will return true:
+ *  "NaN", "NAN", "Inf", "INF", "INFINITY"
  *
  * @param d_str String to check.
  * @return true if string has valid float characters
@@ -74,29 +111,32 @@ inline __device__ bool is_integer(string_view const& d_str)
 inline __device__ bool is_float(string_view const& d_str)
 {
   if (d_str.empty()) return false;
-  // strings allowed by the converter
-  if (d_str.compare("NaN", 3) == 0) return true;
-  if (d_str.compare("Inf", 3) == 0) return true;
-  if (d_str.compare("-Inf", 4) == 0) return true;
   bool decimal_found  = false;
   bool exponent_found = false;
   size_type bytes     = d_str.size_bytes();
   const char* data    = d_str.data();
   // sign character allowed at the beginning of the string
-  size_type chidx = (*data == '-' || *data == '+') ? 1 : 0;
-  bool result     = chidx < bytes;
+  size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0;
+
+  bool result = ch_idx < bytes;
+  // check for nan and infinity strings
+  if (result && data[ch_idx] > '9') {
+    auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx);
+    if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true;
+  }
+
   // check for float chars [0-9] and a single decimal '.'
   // and scientific notation [eE][+-][0-9]
-  for (; chidx < bytes; ++chidx) {
-    auto chr = data[chidx];
+  for (; ch_idx < bytes; ++ch_idx) {
+    auto chr = data[ch_idx];
     if (chr >= '0' && chr <= '9') continue;
     if (!decimal_found && chr == '.') {
       decimal_found = true;  // no more decimals
       continue;
     }
     if (!exponent_found && (chr == 'e' || chr == 'E')) {
-      if (chidx + 1 < bytes) chr = data[chidx + 1];
-      if (chr == '-' || chr == '+') ++chidx;
+      if (ch_idx + 1 < bytes) chr = data[ch_idx + 1];
+      if (chr == '-' || chr == '+') ++ch_idx;
       decimal_found  = true;  // no decimal allowed in exponent
       exponent_found = true;  // no more exponents
       continue;
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index 366d4fe7d42..70b5f528213 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -45,7 +45,7 @@ namespace {
  * @brief This function converts the given string into a
  * floating point double value.
  *
- * This will also map strings containing "NaN", "Inf" and "-Inf"
+ * This will also map strings containing "NaN", "Inf", etc.
  * to the appropriate float values.
  *
  * This function will also handle scientific notation format.
@@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str)
   const char* in_ptr = d_str.data();
   const char* end    = in_ptr + d_str.size_bytes();
   if (end == in_ptr) return 0.0;
-  // special strings
-  if (d_str.compare("NaN", 3) == 0) return std::numeric_limits<double>::quiet_NaN();
-  if (d_str.compare("Inf", 3) == 0) return std::numeric_limits<double>::infinity();
-  if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits<double>::infinity();
   double sign{1.0};
   if (*in_ptr == '-' || *in_ptr == '+') {
     sign = (*in_ptr == '-' ? -1 : 1);
     ++in_ptr;
   }
 
+  // special strings: NaN, Inf
+  if ((in_ptr < end) && *in_ptr > '9') {
+    auto const inf_nan = string_view(in_ptr, static_cast<size_type>(thrust::distance(in_ptr, end)));
+    if (string::is_nan_str(inf_nan)) return std::numeric_limits<double>::quiet_NaN();
+    if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits<double>::infinity();
+  }
+
   // Parse and store the mantissa as much as we can,
   // until we are about to exceed the limit of uint64_t
   constexpr uint64_t max_holding = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp
index 126bffa1e49..e6f4f6bb8d9 100644
--- a/cpp/tests/strings/floats_tests.cpp
+++ b/cpp/tests/strings/floats_tests.cpp
@@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1);
 
   cudf::test::strings_column_wrapper strings2(
-    {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"});
+    {"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"});
   results = cudf::strings::is_float(cudf::strings_column_view(strings2));
-  cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1, 1, 1});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
 }
 
 TEST_F(StringsConvertTest, ToFloats32)
 {
-  std::vector<const char*> h_strings{"1234",
-                                     nullptr,
-                                     "-876",
-                                     "543.2",
-                                     "-0.12",
-                                     ".25",
-                                     "-.002",
-                                     "",
-                                     "-0.0",
-                                     "1.2e4",
-                                     "NaN",
-                                     "abc123",
-                                     "123abc",
-                                     "456e",
-                                     "-1.78e+5",
-                                     "-122.33644782123456789",
-                                     "12e+309",
-                                     "3.4028236E38"};
+  std::vector<const char*> h_strings{
+    "1234",    nullptr,        "-876",     "543.2",
+    "-0.12",   ".25",          "-.002",    "",
+    "-0.0",    "1.2e4",        "NAN",      "abc123",
+    "123abc",  "456e",         "-1.78e+5", "-122.33644782123456789",
+    "12e+309", "3.4028236E38", "INF",      "Infinity"};
   cudf::test::strings_column_wrapper strings(
     h_strings.begin(),
     h_strings.end(),
@@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32)
 
 TEST_F(StringsConvertTest, ToFloats64)
 {
-  std::vector<const char*> h_strings{"1234",
-                                     nullptr,
-                                     "-876",
-                                     "543.2",
-                                     "-0.12",
-                                     ".25",
-                                     "-.002",
-                                     "",
-                                     "-0.0",
-                                     "1.28e256",
-                                     "NaN",
-                                     "abc123",
-                                     "123abc",
-                                     "456e",
-                                     "-1.78e+5",
-                                     "-122.33644782",
-                                     "12e+309",
-                                     "1.7976931348623159E308"};
+  std::vector<const char*> h_strings{
+    "1234",   nullptr,    "-876",     "543.2",         "-0.12",   ".25",
+    "-.002",  "",         "-0.0",     "1.28e256",      "NaN",     "abc123",
+    "123abc", "456e",     "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308",
+    "-Inf",   "-INFINITY"};
   cudf::test::strings_column_wrapper strings(
     h_strings.begin(),
     h_strings.end(),
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index a582541a0d4..cf602c26717 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4919,11 +4919,12 @@ void testIsFloat() {
     try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings);
          ColumnVector isFloat = floatStringCV.isFloat();
          ColumnVector floats = floatStringCV.asFloats();
-         ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, 0f, Float.POSITIVE_INFINITY,
-             Float.NEGATIVE_INFINITY, 0f, 0f, -0f, 0f, Float.MAX_VALUE, Float.POSITIVE_INFINITY,
-             -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, 1.2e-24f, 0f, 0f, null, 423f);
-         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false,
-             false, true, true, true, true, true, true, true, false, false, null, true)) {
+         ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, Float.NaN, Float.POSITIVE_INFINITY,
+             Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, -0f, 0f,
+             Float.MAX_VALUE, Float.POSITIVE_INFINITY, -Float.MAX_VALUE, Float.NEGATIVE_INFINITY,
+             1.2e-24f, 0f, 0f, null, 423f);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true,
+             true, true, true, true, true, true, true, true, false, false, null, true)) {
       assertColumnsAreEqual(expected, isFloat);
       assertColumnsAreEqual(expectedFloats, floats);
     }
@@ -4944,12 +4945,12 @@ void testIsDouble() {
     try (ColumnVector doubleStringCV = ColumnVector.fromStrings(doubleStrings);
          ColumnVector isDouble = doubleStringCV.isFloat();
          ColumnVector doubles = doubleStringCV.asDoubles();
-         ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, 0d,
-             Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, 0d, 0d, -0d, 0d, Double.MAX_VALUE,
-             Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1.2e-234d, 0d,
-             0d, null, 423d);
-         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false,
-             false, true, true, true, true, true, true, true, false, false, null, true)) {
+         ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, Double.NaN,
+             Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY,
+             -0d, 0d, Double.MAX_VALUE, Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY,
+             1.2e-234d, 0d, 0d, null, 423d);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true,
+             true, true, true, true, true, true, true, true, false, false, null, true)) {
       assertColumnsAreEqual(expected, isDouble);
       assertColumnsAreEqual(expectedDoubles, doubles);
     }
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a167383c65c..2a91abc5701 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn):
     cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta,
 }
 
-_NAN_INF_VARIATIONS = [
-    "nan",
-    "NAN",
-    "Nan",
-    "naN",
-    "nAN",
-    "NAn",
-    "nAn",
-    "-inf",
-    "-INF",
-    "-InF",
-    "-inF",
-    "-iNF",
-    "-INf",
-    "-iNf",
-    "+inf",
-    "+INF",
-    "+InF",
-    "+inF",
-    "+iNF",
-    "+INf",
-    "+Inf",
-    "+iNf",
-    "inf",
-    "INF",
-    "InF",
-    "inF",
-    "iNF",
-    "INf",
-    "iNf",
-]
-_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-]
-
 
 def _is_supported_regex_flags(flags):
     return flags == 0 or (
@@ -5309,16 +5246,6 @@ def as_numerical_column(
                     "type due to presence of non-integer values."
                 )
         elif out_dtype.kind == "f":
-            # TODO: Replace this `replace` call with a
-            # case-insensitive method once following
-            # issue is fixed: https://github.com/rapidsai/cudf/issues/5217
-            old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS)
-            new_values = cudf.core.column.as_column(
-                _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS
-            )
-            string_col = libcudf.replace.replace(
-                string_col, old_values, new_values
-            )
             if not libstrings.is_float(string_col).all():
                 raise ValueError(
                     "Could not convert strings to float "

From 00a8845780ae9289f483f1113e5c62d4acd7dfe7 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 30 Nov 2021 14:02:24 -0600
Subject: [PATCH 22/42] Refactor TableTest assertion methods to a separate
 utility class (#9762)

TableTest has a number of dependencies, e.g.: Parquet, Hadoop, etc., that make it less ideal to be used in an external project. This moves the column and table assertion methods to a separate AssertUtils utility class that avoids the extra dependencies.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Gera Shegalov (https://github.com/gerashegalov)

URL: https://github.com/rapidsai/cudf/pull/9762
---
 .../ai/rapids/cudf/ArrowColumnVectorTest.java |   3 +-
 .../test/java/ai/rapids/cudf/AssertUtils.java | 272 ++++++++++++++++++
 .../java/ai/rapids/cudf/BinaryOpTest.java     |   2 +-
 .../ai/rapids/cudf/ByteColumnVectorTest.java  |   6 +-
 .../java/ai/rapids/cudf/ColumnVectorTest.java |  38 +--
 .../test/java/ai/rapids/cudf/IfElseTest.java  |   2 +-
 .../ai/rapids/cudf/IntColumnVectorTest.java   |   4 +-
 .../test/java/ai/rapids/cudf/ScalarTest.java  |   2 +-
 .../test/java/ai/rapids/cudf/TableTest.java   | 251 +---------------
 .../cudf/TimestampColumnVectorTest.java       |   2 +-
 .../test/java/ai/rapids/cudf/UnaryOpTest.java |   2 +-
 .../cudf/ast/CompiledExpressionTest.java      |   2 +-
 12 files changed, 309 insertions(+), 277 deletions(-)
 create mode 100644 java/src/test/java/ai/rapids/cudf/AssertUtils.java

diff --git a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java
index d5d4059d18d..2a11b24b3a8 100644
--- a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java
@@ -21,7 +21,6 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 
-import ai.rapids.cudf.HostColumnVector.BasicType;
 import ai.rapids.cudf.HostColumnVector.ListType;
 import ai.rapids.cudf.HostColumnVector.StructType;
 
@@ -40,7 +39,7 @@
 
 import org.junit.jupiter.api.Test;
 
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
diff --git a/java/src/test/java/ai/rapids/cudf/AssertUtils.java b/java/src/test/java/ai/rapids/cudf/AssertUtils.java
new file mode 100644
index 00000000000..184e7dd0c57
--- /dev/null
+++ b/java/src/test/java/ai/rapids/cudf/AssertUtils.java
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/** Utility methods for asserting in unit tests */
+public class AssertUtils {
+
+  /**
+   * Asserts that two columns match, labelling any failure with the column name "unnamed"
+   * @param expect The expected result column
+   * @param cv The actual column being checked
+   */
+  public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) {
+    assertColumnsAreEqual(expect, cv, "unnamed");
+  }
+
+  /**
+   * Asserts that cv matches all rows of expected, comparing null counts but not nullability
+   * @param expected The expected result column
+   * @param cv The actual column being checked
+   * @param colName The name of the column, used in failure messages
+   */
+  public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) {
+    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false);
+  }
+
+  /**
+   * Asserts that host column cv matches all rows of expected, comparing null counts but not nullability
+   * @param expected The expected result host column
+   * @param cv The actual host column being checked
+   * @param colName The name of the host column, used in failure messages
+   */
+  public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) {
+    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false);
+  }
+
+  /**
+   * Asserts that Struct column cv matches all rows of expected, reporting failures as "unnamed"
+   * @param expected The expected result Struct column
+   * @param cv The actual Struct column being checked
+   */
+  public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) {
+    assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false);
+  }
+
+  /**
+   * Copies both Struct columns to the host and asserts that the requested rows of expected match cv
+   * @param expected The expected result Struct column
+   * @param rowOffset The first row of expected to compare
+   * @param length The number of rows to compare, starting at rowOffset
+   * @param cv The actual Struct column being checked
+   * @param colName The name of the column, used in failure messages
+   * @param enableNullCountCheck Whether to compare the null counts of the two columns
+   * @param enableNullabilityCheck Whether to also compare the presence of a validity vector
+   */
+  public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length,
+      ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) {
+    try (HostColumnVector hostExpected = expected.copyToHost();
+         HostColumnVector hostcv = cv.copyToHost()) {
+      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck);
+    }
+  }
+
+  /**
+   * Copies both columns to the host, then asserts rows [rowOffset, rowOffset + length) of expected match cv
+   * @param expected The expected result column
+   * @param cv The actual column being checked
+   * @param colName The name of the column, used in failure messages
+   * @param enableNullCheck Whether to compare the null counts of the two columns
+   * @param enableNullabilityCheck Whether to also compare the presence of a validity vector
+   */
+  public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length,
+      ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) {
+    try (HostColumnVector hostExpected = expected.copyToHost();
+         HostColumnVector hostcv = cv.copyToHost()) {
+      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck);
+    }
+  }
+
+  /**
+   * Checks and asserts that rows [rowOffset, rowOffset + length) of expected match all rows of cv
+   * @param expected The expected result host column
+   * @param rowOffset start row index
+   * @param length  number of rows from starting offset
+   * @param cv The input host column
+   * @param colName The name of the host column
+   * @param enableNullCountCheck Whether to check for nulls in the host column
+   * @param enableNullabilityCheck Whether to also compare the presence of a validity vector
+   */
+  public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length,
+                                                  HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) {
+    assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName);
+    assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName);
+    assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName);
+    if (enableNullCountCheck) {
+      assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName);
+    } else {
+      // TODO add in a proper check when null counts are supported by serializing a partitioned column
+    }
+    if (enableNullabilityCheck) {
+      assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected");
+    }
+    DType type = expected.getType();
+    for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) {
+      long tableRow = expectedRow - rowOffset; // cv is indexed from 0, expected from rowOffset
+      assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow),
+          "NULL for Column " + colName + " Row " + tableRow);
+      if (!expected.isNull(expectedRow)) {
+        switch (type.typeId) {
+          case BOOL8: // fall through
+          case INT8: // fall through
+          case UINT8:
+            assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow),
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case INT16: // fall through
+          case UINT16:
+            assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow),
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case INT32: // fall through
+          case UINT32: // fall through
+          case TIMESTAMP_DAYS: // fall through
+          case DURATION_DAYS: // fall through
+          case DECIMAL32:
+            assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow),
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case INT64: // fall through
+          case UINT64: // fall through
+          case DURATION_MICROSECONDS: // fall through
+          case DURATION_MILLISECONDS: // fall through
+          case DURATION_NANOSECONDS: // fall through
+          case DURATION_SECONDS: // fall through
+          case TIMESTAMP_MICROSECONDS: // fall through
+          case TIMESTAMP_MILLISECONDS: // fall through
+          case TIMESTAMP_NANOSECONDS: // fall through
+          case TIMESTAMP_SECONDS: // fall through
+          case DECIMAL64:
+            assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow),
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case DECIMAL128:
+            assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow),
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case FLOAT32:
+            CudfTestBase.assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001,
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case FLOAT64:
+            CudfTestBase.assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001,
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case STRING:
+            assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow),
+                "Column " + colName + " Row " + tableRow);
+            break;
+          case LIST:
+            // only the per-row child counts are compared here; child values are checked below
+            HostMemoryBuffer expectedOffsets = expected.getOffsets();
+            HostMemoryBuffer cvOffsets = cv.getOffsets();
+            int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) -
+                expectedOffsets.getInt(expectedRow * 4);
+            int cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) -
+                cvOffsets.getInt(tableRow * 4);
+            assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " +
+                colName + " Row " + tableRow);
+            break;
+          case STRUCT:
+            // parent column only has validity which was checked above
+            break;
+          default:
+            throw new IllegalArgumentException(type + " is not supported yet");
+        }
+      }
+    }
+
+    if (type.isNestedType()) {
+      switch (type.typeId) {
+        case LIST:
+          int expectedChildRowOffset = 0;
+          int numChildRows = 0;
+          if (length > 0) {
+            // translate the parent row window into the matching window of child rows
+            HostMemoryBuffer expectedOffsets = expected.getOffsets();
+            expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4);
+            numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) -
+                expectedChildRowOffset;
+          }
+          assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset,
+              numChildRows, cv.getNestedChildren().get(0), colName + " list child",
+              enableNullCountCheck, enableNullabilityCheck);
+          break;
+        case STRUCT:
+          List<HostColumnVectorCore> expectedChildren = expected.getNestedChildren();
+          List<HostColumnVectorCore> cvChildren = cv.getNestedChildren();
+          for (int i = 0; i < expectedChildren.size(); i++) {
+            HostColumnVectorCore expectedChild = expectedChildren.get(i);
+            HostColumnVectorCore cvChild = cvChildren.get(i);
+            String childName = colName + " child " + i;
+            assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + childName);
+            assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild,
+                childName, enableNullCountCheck, enableNullabilityCheck);
+          }
+          break;
+        default:
+          throw new IllegalArgumentException(type + " is not supported yet");
+      }
+    }
+  }
+
+  /**
+   * Checks and asserts that a row range of the expected table matches the given table, column by column.
+   * @param expected the expected result table
+   * @param rowOffset the row number to start checking from
+   * @param length the number of rows to check
+   * @param table the input table to compare against expected
+   * @param enableNullCheck whether to compare the null counts of each column
+   * @param enableNullabilityCheck whether to also compare the presence of a validity vector
+   */
+  public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table,
+                                                 boolean enableNullCheck, boolean enableNullabilityCheck) {
+    assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns());
+    assertEquals(length, table.getRowCount(), "ROW COUNT");
+    for (int col = 0; col < expected.getNumberOfColumns(); col++) {
+      ColumnVector expect = expected.getColumn(col);
+      ColumnVector cv = table.getColumn(col);
+      String name = String.valueOf(col); // columns are identified by index in failure messages
+      if (rowOffset != 0 || length != expected.getRowCount()) {
+        name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1);
+      }
+      assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck);
+    }
+  }
+
+  /**
+   * Checks and asserts that all rows of the two tables match (null counts compared, nullability not)
+   * @param expected the expected result table
+   * @param table the input table to compare against expected
+   */
+  public static void assertTablesAreEqual(Table expected, Table table) {
+    assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false);
+  }
+
+  /** Asserts that table t has exactly expectedTypes.length columns with those types, in order. */
+  public static void assertTableTypes(DType[] expectedTypes, Table t) {
+    final int numColumns = t.getNumberOfColumns();
+    assertEquals(expectedTypes.length, numColumns);
+    for (int col = 0; col < numColumns; col++) {
+      DType actualType = t.getColumn(col).getType();
+      assertEquals(expectedTypes[col], actualType, "Types don't match at " + col);
+    }
+  }
+}
diff --git a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java
index 894861b8c44..0ca997d3c80 100644
--- a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java
+++ b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java
@@ -27,7 +27,7 @@
 import java.util.Arrays;
 import java.util.stream.IntStream;
 
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 import static ai.rapids.cudf.TestUtils.*;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java
index 878fa7e4516..a26dbec4907 100644
--- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java
@@ -127,9 +127,9 @@ public void testCastToByte() {
          ColumnVector expected1 = ColumnVector.fromBytes((byte)4, (byte)3, (byte)8);
          ColumnVector expected2 = ColumnVector.fromBytes((byte)100);
          ColumnVector expected3 = ColumnVector.fromBytes((byte)-23)) {
-      TableTest.assertColumnsAreEqual(expected1, byteColumnVector1);
-      TableTest.assertColumnsAreEqual(expected2, byteColumnVector2);
-      TableTest.assertColumnsAreEqual(expected3, byteColumnVector3);
+      AssertUtils.assertColumnsAreEqual(expected1, byteColumnVector1);
+      AssertUtils.assertColumnsAreEqual(expected2, byteColumnVector2);
+      AssertUtils.assertColumnsAreEqual(expected3, byteColumnVector3);
     }
   }
 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index cf602c26717..fa9052029cc 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -34,8 +34,10 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertStructColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual;
 import static ai.rapids.cudf.QuantileMethod.*;
-import static ai.rapids.cudf.TableTest.*;
 import static org.junit.jupiter.api.Assertions.*;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
@@ -86,8 +88,8 @@ void testTransformVector() {
          ColumnVector cv1 = cv.transform(ptx, true);
          ColumnVector cv2 = cv.transform(cuda, false);
          ColumnVector expected = ColumnVector.fromBoxedInts(2*2-2, 3*3-3, null, 4*4-4)) {
-      TableTest.assertColumnsAreEqual(expected, cv1);
-      TableTest.assertColumnsAreEqual(expected, cv2);
+      assertColumnsAreEqual(expected, cv1);
+      assertColumnsAreEqual(expected, cv2);
     }
   }
 
@@ -252,7 +254,7 @@ void testStringCreation() {
     try (ColumnVector cv = ColumnVector.fromStrings("d", "sd", "sde", null, "END");
          HostColumnVector host = cv.copyToHost();
          ColumnVector backAgain = host.copyToDevice()) {
-      TableTest.assertColumnsAreEqual(cv, backAgain);
+      assertColumnsAreEqual(cv, backAgain);
     }
   }
 
@@ -265,7 +267,7 @@ void testUTF8StringCreation() {
             null,
             "END".getBytes(StandardCharsets.UTF_8));
          ColumnVector expected = ColumnVector.fromStrings("d", "sd", "sde", null, "END")) {
-      TableTest.assertColumnsAreEqual(expected, cv);
+      assertColumnsAreEqual(expected, cv);
     }
   }
 
@@ -299,7 +301,7 @@ void testConcatNoNulls() {
          ColumnVector v2 = ColumnVector.fromInts(8, 9);
          ColumnVector v = ColumnVector.concatenate(v0, v1, v2);
          ColumnVector expected = ColumnVector.fromInts(1, 2, 3, 4, 5, 6, 7, 8, 9)) {
-      TableTest.assertColumnsAreEqual(expected, v);
+      assertColumnsAreEqual(expected, v);
     }
   }
 
@@ -310,7 +312,7 @@ void testConcatWithNulls() {
          ColumnVector v2 = ColumnVector.fromBoxedDoubles(null, 9.0);
          ColumnVector v = ColumnVector.concatenate(v0, v1, v2);
          ColumnVector expected = ColumnVector.fromBoxedDoubles(1., 2., 3., 4., 5., 6., 7., null, 9.)) {
-      TableTest.assertColumnsAreEqual(expected, v);
+      assertColumnsAreEqual(expected, v);
     }
   }
 
@@ -1882,13 +1884,13 @@ void testSubvector() {
     try (ColumnVector vec = ColumnVector.fromBoxedInts(1, 2, 3, null, 5);
          ColumnVector expected = ColumnVector.fromBoxedInts(2, 3, null, 5);
          ColumnVector found = vec.subVector(1, 5)) {
-      TableTest.assertColumnsAreEqual(expected, found);
+      assertColumnsAreEqual(expected, found);
     }
 
     try (ColumnVector vec = ColumnVector.fromStrings("1", "2", "3", null, "5");
          ColumnVector expected = ColumnVector.fromStrings("2", "3", null, "5");
          ColumnVector found = vec.subVector(1, 5)) {
-      TableTest.assertColumnsAreEqual(expected, found);
+      assertColumnsAreEqual(expected, found);
     }
   }
 
@@ -2014,7 +2016,7 @@ void testTrimStringsWhiteSpace() {
     try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n");
          ColumnVector trimmed = cv.strip();
          ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "123")) {
-      TableTest.assertColumnsAreEqual(expected, trimmed);
+      assertColumnsAreEqual(expected, trimmed);
     }
   }
 
@@ -2024,7 +2026,7 @@ void testTrimStrings() {
          Scalar one = Scalar.fromString(" 1");
          ColumnVector trimmed = cv.strip(one);
          ColumnVector expected = ColumnVector.fromStrings("23", "23", null, "23", "\t\t123\n\n")) {
-      TableTest.assertColumnsAreEqual(expected, trimmed);
+      assertColumnsAreEqual(expected, trimmed);
     }
   }
 
@@ -2033,7 +2035,7 @@ void testLeftTrimStringsWhiteSpace() {
     try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n");
          ColumnVector trimmed = cv.lstrip();
          ColumnVector expected = ColumnVector.fromStrings("123", "123 ", null, "123 ", "123\n\n")) {
-      TableTest.assertColumnsAreEqual(expected, trimmed);
+      assertColumnsAreEqual(expected, trimmed);
     }
   }
 
@@ -2043,7 +2045,7 @@ void testLeftTrimStrings() {
          Scalar one = Scalar.fromString(" 1");
          ColumnVector trimmed = cv.lstrip(one);
          ColumnVector expected = ColumnVector.fromStrings("23", "23 ", null, "231", "\t\t123\n\n")) {
-      TableTest.assertColumnsAreEqual(expected, trimmed);
+      assertColumnsAreEqual(expected, trimmed);
     }
   }
 
@@ -2052,7 +2054,7 @@ void testRightTrimStringsWhiteSpace() {
     try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n");
          ColumnVector trimmed = cv.rstrip();
          ColumnVector expected = ColumnVector.fromStrings(" 123", "123", null, " 123", "\t\t123")) {
-      TableTest.assertColumnsAreEqual(expected, trimmed);
+      assertColumnsAreEqual(expected, trimmed);
     }
   }
 
@@ -2062,7 +2064,7 @@ void testRightTrimStrings() {
          Scalar one = Scalar.fromString(" 1");
          ColumnVector trimmed = cv.rstrip(one);
          ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "\t\t123\n\n")) {
-      TableTest.assertColumnsAreEqual(expected, trimmed);
+      assertColumnsAreEqual(expected, trimmed);
     }
   }
 
@@ -2108,7 +2110,7 @@ void testCountElements() {
         Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 3, 4));
          ColumnVector lengths = cv.countElements();
          ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 2, 3, 4)) {
-      TableTest.assertColumnsAreEqual(expected, lengths);
+      assertColumnsAreEqual(expected, lengths);
     }
   }
 
@@ -2117,7 +2119,7 @@ void testStringLengths() {
     try (ColumnVector cv = ColumnVector.fromStrings("1", "12", null, "123", "1234");
       ColumnVector lengths = cv.getCharLengths();
       ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 3, 4)) {
-      TableTest.assertColumnsAreEqual(expected, lengths);
+      assertColumnsAreEqual(expected, lengths);
     }
   }
 
@@ -2126,7 +2128,7 @@ void testGetByteCount() {
     try (ColumnVector cv = ColumnVector.fromStrings("1", "12", "123", null, "1234");
          ColumnVector byteLengthVector = cv.getByteCount();
          ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 3, null, 4)) {
-      TableTest.assertColumnsAreEqual(expected, byteLengthVector);
+      assertColumnsAreEqual(expected, byteLengthVector);
     }
   }
 
diff --git a/java/src/test/java/ai/rapids/cudf/IfElseTest.java b/java/src/test/java/ai/rapids/cudf/IfElseTest.java
index 86ddcc23416..a078befdf40 100644
--- a/java/src/test/java/ai/rapids/cudf/IfElseTest.java
+++ b/java/src/test/java/ai/rapids/cudf/IfElseTest.java
@@ -25,7 +25,7 @@
 
 import java.util.stream.Stream;
 
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
 public class IfElseTest extends CudfTestBase {
diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java
index dd03c4de69e..2fb8164534b 100644
--- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java
@@ -117,8 +117,8 @@ public void testCastToInt() {
          ColumnVector expected1 = ColumnVector.fromInts(4, 3, 8);
          ColumnVector intColumnVector2 = shortColumnVector.asInts();
          ColumnVector expected2 = ColumnVector.fromInts(100)) {
-      TableTest.assertColumnsAreEqual(expected1, intColumnVector1);
-      TableTest.assertColumnsAreEqual(expected2, intColumnVector2);
+      AssertUtils.assertColumnsAreEqual(expected1, intColumnVector1);
+      AssertUtils.assertColumnsAreEqual(expected2, intColumnVector2);
     }
   }
 
diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java
index 0889363c2d0..86c340bb321 100644
--- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java
@@ -29,7 +29,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 import static org.junit.jupiter.api.Assertions.*;
 
 public class ScalarTest extends CudfTestBase {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index b4247e9bb7c..fa221e19387 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -57,6 +57,11 @@
 import java.util.stream.Collectors;
 
 import static ai.rapids.cudf.ColumnWriterOptions.mapColumn;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertPartialColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertPartialTablesAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertTableTypes;
+import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual;
 import static ai.rapids.cudf.ParquetWriterOptions.listBuilder;
 import static ai.rapids.cudf.ParquetWriterOptions.structBuilder;
 import static ai.rapids.cudf.Table.TestBuilder;
@@ -94,242 +99,6 @@ public class TableTest extends CudfTestBase {
       "8|118.2|128\n" +
       "9|119.8|129").getBytes(StandardCharsets.UTF_8);
 
-  /**
-   * Checks and asserts that passed in columns match
-   * @param expect The expected result column
-   * @param cv The input column
-   */
-  public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) {
-    assertColumnsAreEqual(expect, cv, "unnamed");
-  }
-
-  /**
-   * Checks and asserts that passed in columns match
-   * @param expected The expected result column
-   * @param cv The input column
-   * @param colName The name of the column
-   */
-  public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) {
-    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false);
-  }
-
-  /**
-   * Checks and asserts that passed in host columns match
-   * @param expected The expected result host column
-   * @param cv The input host column
-   * @param colName The name of the host column
-   */
-  public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) {
-    assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false);
-  }
-
-  /**
-   * Checks and asserts that passed in Struct columns match
-   * @param expected The expected result Struct column
-   * @param cv The input Struct column
-   */
-  public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) {
-    assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false);
-  }
-
-  /**
-   * Checks and asserts that passed in Struct columns match
-   * @param expected The expected result Struct column
-   * @param rowOffset The row number to look from
-   * @param length The number of rows to consider
-   * @param cv The input Struct column
-   * @param colName The name of the column
-   * @param enableNullCountCheck Whether to check for nulls in the Struct column
-   * @param enableNullabilityCheck Whether the table have a validity mask
-   */
-  public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length,
-      ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) {
-    try (HostColumnVector hostExpected = expected.copyToHost();
-         HostColumnVector hostcv = cv.copyToHost()) {
-      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck);
-    }
-  }
-
-  /**
-   * Checks and asserts that passed in columns match
-   * @param expected The expected result column
-   * @param cv The input column
-   * @param colName The name of the column
-   * @param enableNullCheck Whether to check for nulls in the column
-   * @param enableNullabilityCheck Whether the table have a validity mask
-   */
-  public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length,
-      ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) {
-    try (HostColumnVector hostExpected = expected.copyToHost();
-         HostColumnVector hostcv = cv.copyToHost()) {
-      assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck);
-    }
-  }
-
-  /**
-   * Checks and asserts that passed in host columns match
-   * @param expected The expected result host column
-   * @param rowOffset start row index
-   * @param length  number of rows from starting offset
-   * @param cv The input host column
-   * @param colName The name of the host column
-   * @param enableNullCountCheck Whether to check for nulls in the host column
-   */
-  public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length,
-                                                  HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) {
-    assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName);
-    assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName);
-    assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName);
-    if (enableNullCountCheck) {
-      assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName);
-    } else {
-      // TODO add in a proper check when null counts are supported by serializing a partitioned column
-    }
-    if (enableNullabilityCheck) {
-      assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected");
-    }
-    DType type = expected.getType();
-    for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) {
-      long tableRow = expectedRow - rowOffset;
-      assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow),
-          "NULL for Column " + colName + " Row " + tableRow);
-      if (!expected.isNull(expectedRow)) {
-        switch (type.typeId) {
-          case BOOL8: // fall through
-          case INT8: // fall through
-          case UINT8:
-            assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow),
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case INT16: // fall through
-          case UINT16:
-            assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow),
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case INT32: // fall through
-          case UINT32: // fall through
-          case TIMESTAMP_DAYS:
-          case DURATION_DAYS:
-          case DECIMAL32:
-            assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow),
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case INT64: // fall through
-          case UINT64: // fall through
-          case DURATION_MICROSECONDS: // fall through
-          case DURATION_MILLISECONDS: // fall through
-          case DURATION_NANOSECONDS: // fall through
-          case DURATION_SECONDS: // fall through
-          case TIMESTAMP_MICROSECONDS: // fall through
-          case TIMESTAMP_MILLISECONDS: // fall through
-          case TIMESTAMP_NANOSECONDS: // fall through
-          case TIMESTAMP_SECONDS:
-          case DECIMAL64:
-            assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow),
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case DECIMAL128:
-            assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow),
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case FLOAT32:
-            assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001,
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case FLOAT64:
-            assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001,
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case STRING:
-            assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow),
-                "Column " + colName + " Row " + tableRow);
-            break;
-          case LIST:
-            HostMemoryBuffer expectedOffsets = expected.getOffsets();
-            HostMemoryBuffer cvOffsets = cv.getOffsets();
-            int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) -
-                expectedOffsets.getInt(expectedRow * 4);
-            int cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) -
-                cvOffsets.getInt(tableRow * 4);
-            assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " +
-                colName + " Row " + tableRow);
-            break;
-          case STRUCT:
-            // parent column only has validity which was checked above
-            break;
-          default:
-            throw new IllegalArgumentException(type + " is not supported yet");
-        }
-      }
-    }
-
-    if (type.isNestedType()) {
-      switch (type.typeId) {
-        case LIST:
-          int expectedChildRowOffset = 0;
-          int numChildRows = 0;
-          if (length > 0) {
-            HostMemoryBuffer expectedOffsets = expected.getOffsets();
-            HostMemoryBuffer cvOffsets = cv.getOffsets();
-            expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4);
-            numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) -
-                expectedChildRowOffset;
-          }
-          assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset,
-              numChildRows, cv.getNestedChildren().get(0), colName + " list child",
-              enableNullCountCheck, enableNullabilityCheck);
-          break;
-        case STRUCT:
-          List<HostColumnVectorCore> expectedChildren = expected.getNestedChildren();
-          List<HostColumnVectorCore> cvChildren = cv.getNestedChildren();
-          for (int i = 0; i < expectedChildren.size(); i++) {
-            HostColumnVectorCore expectedChild = expectedChildren.get(i);
-            HostColumnVectorCore cvChild = cvChildren.get(i);
-            String childName = colName + " child " + i;
-            assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName);
-            assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild,
-                colName, enableNullCountCheck, enableNullabilityCheck);
-          }
-          break;
-        default:
-          throw new IllegalArgumentException(type + " is not supported yet");
-      }
-    }
-  }
-
-  /**
-   * Checks and asserts that the two tables from a given rowindex match based on a provided schema.
-   * @param expected the expected result table
-   * @param rowOffset the row number to start checking from
-   * @param length the number of rows to check
-   * @param table the input table to compare against expected
-   * @param enableNullCheck whether to check for nulls or not
-   * @param enableNullabilityCheck whether the table have a validity mask
-   */
-  public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table,
-                                                 boolean enableNullCheck, boolean enableNullabilityCheck) {
-    assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns());
-    assertEquals(length, table.getRowCount(), "ROW COUNT");
-    for (int col = 0; col < expected.getNumberOfColumns(); col++) {
-      ColumnVector expect = expected.getColumn(col);
-      ColumnVector cv = table.getColumn(col);
-      String name = String.valueOf(col);
-      if (rowOffset != 0 || length != expected.getRowCount()) {
-        name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1);
-      }
-      assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck);
-    }
-  }
-
-  /**
-   * Checks and asserts that the two tables match
-   * @param expected the expected result table
-   * @param table the input table to compare against expected
-   */
-  public static void assertTablesAreEqual(Table expected, Table table) {
-    assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false);
-  }
-
   void assertTablesHaveSameValues(HashMap<Object, Integer>[] expectedTable, Table table) {
     assertEquals(expectedTable.length, table.getNumberOfColumns());
     int numCols = table.getNumberOfColumns();
@@ -358,16 +127,6 @@ void assertTablesHaveSameValues(HashMap<Object, Integer>[] expectedTable, Table
     }
   }
 
-  public static void assertTableTypes(DType[] expectedTypes, Table t) {
-    int len = t.getNumberOfColumns();
-    assertEquals(expectedTypes.length, len);
-    for (int i = 0; i < len; i++) {
-      ColumnVector vec = t.getColumn(i);
-      DType type = vec.getType();
-      assertEquals(expectedTypes[i], type, "Types don't match at " + i);
-    }
-  }
-
   @Test
   void testMergeSimple() {
     try (Table table1 = new Table.TestBuilder()
diff --git a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java
index 8bf1370a0f7..9a929cec98d 100644
--- a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java
@@ -22,7 +22,7 @@
 
 import java.util.function.Function;
 
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 public class TimestampColumnVectorTest extends CudfTestBase {
diff --git a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java
index 76970e8bf76..7fcb7cbd85b 100644
--- a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java
+++ b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java
@@ -22,7 +22,7 @@
 import ai.rapids.cudf.HostColumnVector.Builder;
 import org.junit.jupiter.api.Test;
 
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 
 public class UnaryOpTest extends CudfTestBase {
   private static final Double[] DOUBLES_1 = new Double[]{1.0, 10.0, -100.1, 5.3, 50.0, 100.0, null, Double.NaN, Double.POSITIVE_INFINITY, 1/9.0, Double.NEGATIVE_INFINITY, 500.0, -500.0};
diff --git a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java
index 2fb6792b409..e50da0a4d4d 100644
--- a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java
@@ -36,7 +36,7 @@
 import java.util.function.Function;
 import java.util.stream.Stream;
 
-import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 
 public class CompiledExpressionTest extends CudfTestBase {
   @Test

From 554ac817498e64ba1c7ef054873fab7dc658d25c Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 30 Nov 2021 15:50:56 -0600
Subject: [PATCH 23/42] Load native dependencies when Java ColumnView is loaded
 (#9800)

The Java ColumnView class has native methods but does not ensure the corresponding native libraries that implement those methods are loaded.  This adds a static code block to the ColumnView class to load the native libraries when the ColumnView class is loaded.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Kuhu Shukla (https://github.com/kuhushukla)

URL: https://github.com/rapidsai/cudf/pull/9800
---
 java/src/main/java/ai/rapids/cudf/ColumnView.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 729444f460c..6d0d24baf99 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -30,6 +30,10 @@
  */
 public class ColumnView implements AutoCloseable, BinaryOperable {
 
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
   public static final long UNKNOWN_NULL_COUNT = -1;
 
   protected long viewHandle;

From 20d6723fcb5eaffb6398e5cf6c14de8d774ca917 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 30 Nov 2021 15:51:12 -0600
Subject: [PATCH 24/42] Copy Java native dependencies directly into classpath
 (#9787)

Eliminates the intermediate copy of the native libraries for the Java bindings into target/native-deps, instead copying libcudf.so and libcudfjni.so directly into the classpath resources.  This eliminates the need to search target/native-deps at runtime when the native libraries are not in the classpath in the case of running tests before the jar is built.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/9787
---
 java/pom.xml                                          |  7 ++-----
 .../main/java/ai/rapids/cudf/NativeDepsLoader.java    | 11 ++---------
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/java/pom.xml b/java/pom.xml
index 87d43ec1272..c5a3bc64fad 100755
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -297,9 +297,6 @@
                 <include>LICENSE</include>
               </includes>
             </resource>
-            <resource>
-                <directory>${project.build.directory}/native-deps/</directory>
-            </resource>
         </resources>
         <pluginManagement>
             <plugins>
@@ -499,14 +496,14 @@
                 <executions>
                     <execution>
                         <id>copy-native-libs</id>
-                        <phase>validate</phase>
+                        <phase>generate-resources</phase>
                         <goals>
                             <goal>copy-resources</goal>
                         </goals>
                         <configuration>
                             <overwrite>true</overwrite>
                             <skip>${skipNativeCopy}</skip>
-                            <outputDirectory>${project.build.directory}/native-deps/${os.arch}/${os.name}</outputDirectory>
+                            <outputDirectory>${project.build.outputDirectory}/${os.arch}/${os.name}</outputDirectory>
                             <resources>
                                 <resource>
                                     <directory>${native.build.path}</directory>
diff --git a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
index 8780ecc3aa3..9663fbcafb4 100755
--- a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
+++ b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
@@ -81,9 +81,7 @@ public static synchronized void loadNativeDeps() {
 
   /**
    * Allows other libraries to reuse the same native deps loading logic. Libraries will be searched
-   * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class. It
-   * will also look for the libraries under ./target/native-deps/${os.arch}/${os.name} to help
-   * facilitate testing while building.
+   * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class.
    * <br/>
    * Because this just loads the libraries and loading the libraries themselves needs to be a
    * singleton operation it is recommended that any library using this provide their own wrapper
@@ -203,12 +201,7 @@ private static File createFile(String os, String arch, String baseName) throws I
     File loc;
     URL resource = loader.getResource(path);
     if (resource == null) {
-      // It looks like we are not running from the jar, or there are issues with the jar
-      File f = new File("./target/native-deps/" + path);
-      if (!f.exists()) {
-        throw new FileNotFoundException("Could not locate native dependency " + path);
-      }
-      resource = f.toURI().toURL();
+      throw new FileNotFoundException("Could not locate native dependency " + path);
     }
     try (InputStream in = resource.openStream()) {
       loc = File.createTempFile(baseName, ".so");

From 991136c78be01d4de20387086a185cfd5a21713b Mon Sep 17 00:00:00 2001
From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com>
Date: Tue, 30 Nov 2021 15:31:53 -0800
Subject: [PATCH 25/42] Add Pearson correlation for sort groupby (python)
 (#9166)

Fixes: https://github.com/rapidsai/cudf/issues/8691

Authors:
  - Sheilah Kirui (https://github.com/skirui-source)
  - Karthikeyan (https://github.com/karthikeyann)
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Michael Wang (https://github.com/isVoid)
  - Mayank Anand (https://github.com/mayankanand007)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9166
---
 docs/cudf/source/api_docs/groupby.rst     |   1 +
 docs/cudf/source/basics/groupby.rst       |  10 ++
 python/cudf/cudf/_lib/aggregation.pyx     |  55 +++++++++-
 python/cudf/cudf/_lib/cpp/aggregation.pxd |  15 ++-
 python/cudf/cudf/_lib/groupby.pyx         |   4 +-
 python/cudf/cudf/core/groupby/groupby.py  | 121 +++++++++++++++++++++-
 python/cudf/cudf/tests/test_dataframe.py  | 115 ++++++++++++++++++++
 7 files changed, 314 insertions(+), 7 deletions(-)

diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst
index cf08d1d791b..575d7442cdf 100644
--- a/docs/cudf/source/api_docs/groupby.rst
+++ b/docs/cudf/source/api_docs/groupby.rst
@@ -59,6 +59,7 @@ Computations / descriptive stats
    GroupBy.std
    GroupBy.sum
    GroupBy.var
+   GroupBy.corr
    
 The following methods are available in both ``SeriesGroupBy`` and
 ``DataFrameGroupBy`` objects, but may differ slightly, usually in that
diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst
index 04c4d42fa2a..f3269768025 100644
--- a/docs/cudf/source/basics/groupby.rst
+++ b/docs/cudf/source/basics/groupby.rst
@@ -127,6 +127,13 @@ Aggregations on groups is supported via the ``agg`` method:
     a
     1   4   1  2.0
     2   5   2  4.5
+    >>> df.groupby("a").corr(method="pearson")
+              b          c
+    a                      
+    1 b  1.000000  0.866025
+      c  0.866025  1.000000
+    2 b  1.000000  1.000000
+      c  1.000000  1.000000
 
 The following table summarizes the available aggregations and the types
 that support them:
@@ -169,6 +176,9 @@ that support them:
    +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
    | unique                             | ✅        | ✅         | ✅       | ✅            |        |          |            |           |
    +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+   | corr                               | ✅        |            |          |               |        |          |            | ✅        |
+   +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+   
 
 GroupBy apply
 -------------
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 4f703724cef..68f7101b6ee 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -1,6 +1,6 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
-from enum import Enum
+from enum import Enum, IntEnum
 
 import numba
 import numpy as np
@@ -30,6 +30,7 @@ from cudf._lib.types import Interpolation
 
 cimport cudf._lib.cpp.aggregation as libcudf_aggregation
 cimport cudf._lib.cpp.types as libcudf_types
+from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type
 
 import cudf
 
@@ -57,6 +58,22 @@ class AggregationKind(Enum):
     UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET
     PTX = libcudf_aggregation.aggregation.Kind.PTX
     CUDA = libcudf_aggregation.aggregation.Kind.CUDA
+    CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION
+
+
+class CorrelationType(IntEnum):
+    PEARSON = (
+        <underlying_type_t_correlation_type>
+        libcudf_aggregation.correlation_type.PEARSON
+    )
+    KENDALL = (
+        <underlying_type_t_correlation_type>
+        libcudf_aggregation.correlation_type.KENDALL
+    )
+    SPEARMAN = (
+        <underlying_type_t_correlation_type>
+        libcudf_aggregation.correlation_type.SPEARMAN
+    )
 
 
 cdef class Aggregation:
@@ -321,6 +338,22 @@ cdef class Aggregation:
             ))
         return agg
 
+    @classmethod
+    def corr(cls, method, libcudf_types.size_type min_periods):
+        cdef Aggregation agg = cls()
+        cdef libcudf_aggregation.correlation_type c_method = (
+            <libcudf_aggregation.correlation_type> (
+                <underlying_type_t_correlation_type> (
+                    CorrelationType[method.upper()]
+                )
+            )
+        )
+        agg.c_obj = move(
+            libcudf_aggregation.make_correlation_aggregation[aggregation](
+                c_method, min_periods
+            ))
+        return agg
+
 cdef class RollingAggregation:
     """A Cython wrapper for rolling window aggregations.
 
@@ -692,6 +725,24 @@ cdef class GroupbyAggregation:
         )
         return agg
 
+    @classmethod
+    def corr(cls, method, libcudf_types.size_type min_periods):
+        cdef GroupbyAggregation agg = cls()
+        cdef libcudf_aggregation.correlation_type c_method = (
+            <libcudf_aggregation.correlation_type> (
+                <underlying_type_t_correlation_type> (
+                    CorrelationType[method.upper()]
+                )
+            )
+        )
+        agg.c_obj = move(
+            libcudf_aggregation.
+            make_correlation_aggregation[groupby_aggregation](
+                c_method, min_periods
+            ))
+        return agg
+
+
 cdef class GroupbyScanAggregation:
     """A Cython wrapper for groupby scan aggregations.
 
diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd
index 13bfa49057c..3982b4fecbb 100644
--- a/python/cudf/cudf/_lib/cpp/aggregation.pxd
+++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd
@@ -1,5 +1,5 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
@@ -11,6 +11,7 @@ from cudf._lib.cpp.types cimport (
     size_type,
 )
 
+ctypedef int32_t underlying_type_t_correlation_type
 
 cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
 
@@ -38,6 +39,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
             COLLECT_SET 'cudf::aggregation::COLLECT_SET'
             PTX 'cudf::aggregation::PTX'
             CUDA 'cudf::aggregation::CUDA'
+            CORRELATION 'cudf::aggregation::CORRELATION'
+
         Kind kind
 
     cdef cppclass rolling_aggregation:
@@ -53,6 +56,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
         CUDA 'cudf::udf_type::CUDA'
         PTX 'cudf::udf_type::PTX'
 
+    ctypedef enum correlation_type:
+        PEARSON 'cudf::correlation_type::PEARSON'
+        KENDALL 'cudf::correlation_type::KENDALL'
+        SPEARMAN 'cudf::correlation_type::SPEARMAN'
+
     cdef unique_ptr[T] make_sum_aggregation[T]() except +
 
     cdef unique_ptr[T] make_product_aggregation[T]() except +
@@ -106,3 +114,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
         udf_type type,
         string user_defined_aggregator,
         data_type output_type) except +
+
+    cdef unique_ptr[T] make_correlation_aggregation[T](
+        correlation_type type, size_type min_periods) except +
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 0968d22d465..314542c9549 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 from collections import defaultdict
 
@@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"}
 _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT",
                 "UNIQUE"}
 _LIST_AGGS = {"COLLECT"}
-_STRUCT_AGGS = set()
+_STRUCT_AGGS = {"CORRELATION"}
 _INTERVAL_AGGS = set()
 _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE",
                  "NTH", "COLLECT"}
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 7f9f61ed3fd..f1d622362e2 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 import collections
+import itertools
 import pickle
 import warnings
 
@@ -13,7 +14,8 @@
 from cudf._typing import DataFrameOrSeries
 from cudf.api.types import is_list_like
 from cudf.core.abc import Serializable
-from cudf.core.column.column import arange
+from cudf.core.column.column import arange, as_column
+from cudf.core.multiindex import MultiIndex
 from cudf.utils.utils import GetAttrGetItemMixin, cached_property
 
 
@@ -69,6 +71,8 @@ def __init__(
         """
         self.obj = obj
         self._as_index = as_index
+        self._by = by
+        self._level = level
         self._sort = sort
         self._dropna = dropna
 
@@ -777,6 +781,121 @@ def median(self):
         """Get the column-wise median of the values in each group."""
         return self.agg("median")
 
+    def corr(self, method="pearson", min_periods=1):
+        """
+        Compute pairwise correlation of columns, excluding NA/null values.
+
+        Parameters
+        ----------
+        method: {"pearson", "kendall", "spearman"} or callable,
+            default "pearson". Currently only the pearson correlation
+            coefficient is supported.
+
+        min_periods: int, optional
+            Minimum number of observations required per pair of columns
+            to have a valid result.
+
+        Returns
+        ----------
+        DataFrame
+            Correlation matrix.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gdf = cudf.DataFrame({
+        ...             "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+        ...             "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
+        ...             "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+        ...             "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]})
+        >>> gdf
+        id  val1  val2  val3
+        0  a     5     4     4
+        1  a     4     5     5
+        2  a     6     6     6
+        3  b     4     1     1
+        4  b     8     2     2
+        5  b     7     9     9
+        6  c     4     8     8
+        7  c     5     5     5
+        8  c     2     1     1
+        >>> gdf.groupby("id").corr(method="pearson")
+                    val1      val2      val3
+        id
+        a   val1  1.000000  0.500000  0.500000
+            val2  0.500000  1.000000  1.000000
+            val3  0.500000  1.000000  1.000000
+        b   val1  1.000000  0.385727  0.385727
+            val2  0.385727  1.000000  1.000000
+            val3  0.385727  1.000000  1.000000
+        c   val1  1.000000  0.714575  0.714575
+            val2  0.714575  1.000000  1.000000
+            val3  0.714575  1.000000  1.000000
+        """
+
+        if not method.lower() in ("pearson",):
+            raise NotImplementedError(
+                "Only pearson correlation is currently supported"
+            )
+
+        # create expanded dataframe consisting all combinations of the
+        # struct columns-pairs to be correlated
+        # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
+        _cols = self.grouping.values.columns.tolist()
+        len_cols = len(_cols)
+
+        new_df_data = {}
+        for x, y in itertools.combinations_with_replacement(_cols, 2):
+            new_df_data[(x, y)] = cudf.DataFrame._from_data(
+                {"x": self.obj._data[x], "y": self.obj._data[y]}
+            ).to_struct()
+        new_gb = cudf.DataFrame._from_data(new_df_data).groupby(
+            by=self.grouping.keys
+        )
+
+        try:
+            gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods))
+        except RuntimeError as e:
+            if "Unsupported groupby reduction type-agg combination" in str(e):
+                raise TypeError(
+                    "Correlation accepts only numerical column-pairs"
+                )
+            raise
+
+        # ensure that column-pair labels are arranged in ascending order
+        cols_list = [
+            (y, x) if i > j else (x, y)
+            for j, y in enumerate(_cols)
+            for i, x in enumerate(_cols)
+        ]
+        cols_split = [
+            cols_list[i : i + len_cols]
+            for i in range(0, len(cols_list), len_cols)
+        ]
+
+        # interleave: combine the correlation results for each column-pair
+        # into a single column
+        res = cudf.DataFrame._from_data(
+            {
+                x: gb_corr.loc[:, i].interleave_columns()
+                for i, x in zip(cols_split, _cols)
+            }
+        )
+
+        # create a multiindex for the groupby correlated dataframe,
+        # to match pandas behavior
+        unsorted_idx = gb_corr.index.repeat(len_cols)
+        idx_sort_order = unsorted_idx._get_sorted_inds()
+        sorted_idx = unsorted_idx._gather(idx_sort_order)
+        if len(gb_corr):
+            # TO-DO: Should the operation below be done on the CPU instead?
+            sorted_idx._data[None] = as_column(
+                cudf.Series(_cols).tile(len(gb_corr.index))
+            )
+        res.index = MultiIndex._from_data(sorted_idx._data)
+
+        return res
+
     def var(self, ddof=1):
         """Compute the column-wise variance of the values in each group.
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d07caef11d5..d555b5c4033 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8924,3 +8924,118 @@ def test_frame_series_where_other(data):
     expected = gdf.where(gdf["b"] == 1, 0)
     actual = pdf.where(pdf["b"] == 1, 0)
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data, gkey",
+    [
+        (
+            {
+                "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+                "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
+                "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+                "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+            },
+            ["id", "val1", "val2"],
+        ),
+        (
+            {
+                "id": [0] * 4 + [1] * 3,
+                "a": [10, 3, 4, 2, -3, 9, 10],
+                "b": [10, 23, -4, 2, -3, 9, 19],
+            },
+            ["id", "a"],
+        ),
+        (
+            {
+                "id": ["a", "a", "b", "b", "c", "c"],
+                "val": [None, None, None, None, None, None],
+            },
+            ["id"],
+        ),
+        (
+            {
+                "id": ["a", "a", "b", "b", "c", "c"],
+                "val1": [None, 4, 6, 8, None, 2],
+                "val2": [4, 5, None, 2, 9, None],
+            },
+            ["id"],
+        ),
+        ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]),
+    ],
+)
+@pytest.mark.parametrize(
+    "min_per", [0, 1, 2, 3, 4],
+)
+def test_pearson_corr_passing(data, gkey, min_per):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per)
+    expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per)
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("method", ["kendall", "spearman"])
+def test_pearson_corr_unsupported_methods(method):
+    gdf = cudf.DataFrame(
+        {
+            "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+            "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
+            "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+            "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+        }
+    )
+
+    with pytest.raises(
+        NotImplementedError,
+        match="Only pearson correlation is currently supported",
+    ):
+        gdf.groupby("id").corr(method)
+
+
+def test_pearson_corr_empty_columns():
+    gdf = cudf.DataFrame(columns=["id", "val1", "val2"])
+    pdf = gdf.to_pandas()
+
+    actual = gdf.groupby("id").corr("pearson")
+    expected = pdf.groupby("id").corr("pearson")
+
+    assert_eq(
+        expected, actual, check_dtype=False, check_index_type=False,
+    )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {
+            "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+            "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"],
+            "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"],
+        },
+        {
+            "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+            "val1": [1, 1, 1, 2, 2, 2, 3, 3, 3],
+            "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"],
+        },
+    ],
+)
+@pytest.mark.parametrize("gkey", ["id", "val1", "val2"])
+def test_pearson_corr_invalid_column_types(data, gkey):
+    with pytest.raises(
+        TypeError, match="Correlation accepts only numerical column-pairs",
+    ):
+        cudf.DataFrame(data).groupby(gkey).corr("pearson")
+
+
+def test_pearson_corr_multiindex_dataframe():
+    gdf = cudf.DataFrame(
+        {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]}
+    ).set_index(["a", "b"])
+
+    actual = gdf.groupby(level="a").corr("pearson")
+    expected = gdf.to_pandas().groupby(level="a").corr("pearson")
+
+    assert_eq(expected, actual)

From 1eabcb73b7df235de9985e207e2087af9dfb0e14 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Wed, 1 Dec 2021 17:03:36 +0530
Subject: [PATCH 26/42] Fix some doxygen warnings and add missing documentation
 (#9770)

Fix the Doxygen configuration to ignore `__device__ void` return type warnings.
Add missing documentation on some functions.
Correct Doxygen doc-style comments.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/9770
---
 cpp/doxygen/Doxyfile                             |  7 ++++---
 cpp/include/cudf/lists/combine.hpp               |  2 +-
 cpp/include/cudf/scalar/scalar_device_view.cuh   | 16 ++++++++++++++++
 .../cudf/strings/convert/convert_lists.hpp       |  2 +-
 cpp/include/cudf/table/row_operators.cuh         |  3 ++-
 cpp/include/cudf_test/base_fixture.hpp           |  3 +++
 cpp/include/cudf_test/column_wrapper.hpp         |  3 +++
 cpp/include/cudf_test/file_utilities.hpp         |  9 +++++++++
 cpp/include/cudf_test/table_utilities.hpp        |  2 +-
 9 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 55e5119040e..6a556bb4b34 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -2089,7 +2089,7 @@ ENABLE_PREPROCESSING   = YES
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-MACRO_EXPANSION        = NO
+MACRO_EXPANSION        = YES
 
 # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
 # the macro expansion is limited to the macros specified with the PREDEFINED and
@@ -2097,7 +2097,7 @@ MACRO_EXPANSION        = NO
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-EXPAND_ONLY_PREDEF     = NO
+EXPAND_ONLY_PREDEF     = YES
 
 # If the SEARCH_INCLUDES tag is set to YES, the include files in the
 # INCLUDE_PATH will be searched if a #include is found.
@@ -2129,7 +2129,8 @@ INCLUDE_FILE_PATTERNS  =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED             =
+PREDEFINED              = __device__= \
+                          __host__=
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index a9407ed57ca..61a81e8a745 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -26,7 +26,7 @@ namespace lists {
  * @file
  */
 
-/*
+/**
  * @brief Flag to specify whether a null list element will be ignored from concatenation, or the
  * entire concatenation result involving null list elements will be a null element.
  */
diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh
index 884b412d3e2..56afa150dfc 100644
--- a/cpp/include/cudf/scalar/scalar_device_view.cuh
+++ b/cpp/include/cudf/scalar/scalar_device_view.cuh
@@ -91,6 +91,12 @@ class fixed_width_scalar_device_view_base : public detail::scalar_device_view_ba
     return *data<T>();
   }
 
+  /**
+   * @brief Stores the value in scalar
+   *
+   * @tparam T The desired type
+   * @param value The value to store in scalar
+   */
   template <typename T>
   __device__ void set_value(T value)
   {
@@ -159,6 +165,11 @@ class fixed_width_scalar_device_view : public detail::fixed_width_scalar_device_
     return fixed_width_scalar_device_view_base::value<T>();
   }
 
+  /**
+   * @brief Stores the value in scalar
+   *
+   * @param value The value to store in scalar
+   */
   __device__ void set_value(T value) { fixed_width_scalar_device_view_base::set_value<T>(value); }
 
   /**
@@ -218,6 +229,11 @@ class fixed_point_scalar_device_view : public detail::scalar_device_view_base {
   {
   }
 
+  /**
+   * @brief Stores the value in scalar
+   *
+   * @param value The value to store in scalar
+   */
   __device__ void set_value(rep_type value) { *_data = value; }
 
   /**
diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp
index ec22186ea99..279bf44e7fc 100644
--- a/cpp/include/cudf/strings/convert/convert_lists.hpp
+++ b/cpp/include/cudf/strings/convert/convert_lists.hpp
@@ -50,7 +50,7 @@ namespace strings {
  *
  * @param input Lists column to format.
  * @param na_rep Replacment string for null elements.
- * @param separator Strings to use for enclosing list components and separating elements.
+ * @param separators Strings to use for enclosing list components and separating elements.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New strings column.
  */
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index c719c564a87..70ccac2f75d 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -67,7 +67,7 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs)
 }
 }  // namespace detail
 
-/*
+/**
  * @brief A specialization for floating-point `Element` type relational comparison
  * to derive the order of the elements with respect to `lhs`. Specialization is to
  * handle `nan` in the order shown below.
@@ -187,6 +187,7 @@ class element_equality_comparator {
    *
    * @param lhs_element_index The index of the first element
    * @param rhs_element_index The index of the second element
+   * @return True if both lhs and rhs element are both nulls and `nulls_are_equal` is true, or equal
    *
    */
   template <typename Element,
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index 5fa07fd5568..e08bf6aa53a 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -50,6 +50,7 @@ class BaseFixture : public ::testing::Test {
   /**
    * @brief Returns pointer to `device_memory_resource` that should be used for
    * all tests inheriting from this fixture
+   * @return pointer to memory resource
    */
   rmm::mr::device_memory_resource* mr() { return _mr; }
 };
@@ -170,6 +171,7 @@ class UniformRandomGenerator {
 
   /**
    * @brief Returns the next random number.
+   * @return generated random number
    */
   template <typename TL = T, std::enable_if_t<!cudf::is_timestamp<TL>()>* = nullptr>
   T generate()
@@ -211,6 +213,7 @@ class TempDirTestEnvironment : public ::testing::Environment {
   /**
    * @brief Get a temporary filepath to use for the specified filename
    *
+   * @param filename name of the file to be placed in temporary directory.
    * @return std::string The temporary filepath
    */
   std::string get_temp_filepath(std::string filename) { return tmpdir.path() + filename; }
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index f291b04776a..cd2ac9f3ec1 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -79,6 +79,7 @@ class column_wrapper {
 
   /**
    * @brief Releases internal unique_ptr to wrapped column
+   * @return unique_ptr to wrapped column
    */
   std::unique_ptr<cudf::column> release() { return std::move(wrapped); }
 
@@ -1040,11 +1041,13 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
 
   /**
    * @brief Access keys column view
+   * @return column_view to keys column
    */
   column_view keys() const { return cudf::dictionary_column_view{wrapped->view()}.keys(); }
 
   /**
    * @brief Access indices column view
+   * @return column_view to indices column
    */
   column_view indices() const { return cudf::dictionary_column_view{wrapped->view()}.indices(); }
 
diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp
index 90bf0cd99dc..8e242e5a4f3 100644
--- a/cpp/include/cudf_test/file_utilities.hpp
+++ b/cpp/include/cudf_test/file_utilities.hpp
@@ -24,6 +24,10 @@
 
 #include <cudf/utilities/error.hpp>
 
+/**
+ * @brief RAII class for creating a temporary directory.
+ *
+ */
 class temp_directory {
   std::string _path;
 
@@ -49,5 +53,10 @@ class temp_directory {
     nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
   }
 
+  /**
+   * @brief Returns the path of the temporary directory
+   *
+   * @return string path of the temporary directory
+   */
   const std::string& path() const { return _path; }
 };
diff --git a/cpp/include/cudf_test/table_utilities.hpp b/cpp/include/cudf_test/table_utilities.hpp
index 831c9f5ac14..f2427c5b8c6 100644
--- a/cpp/include/cudf_test/table_utilities.hpp
+++ b/cpp/include/cudf_test/table_utilities.hpp
@@ -39,7 +39,7 @@ void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs);
  */
 void expect_tables_equal(cudf::table_view lhs, cudf::table_view rhs);
 
-/*
+/**
  * @brief Verifies the equivalency of two tables.
  *
  * Treats null elements as equivalent.  Columns that have nullability but no nulls,

From 1ceb8ab01120ffe463600db14e6893e196cbb991 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 1 Dec 2021 10:10:10 -0500
Subject: [PATCH 27/42] Improve build time of libcudf iterator tests (#9788)

While working on #9641 I noticed that building the iterator gtests takes a lot of time in CI. Here is a link to the individual build times for libcudf including the gtests:
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5173/testReport/(root)/BuildTime/
(You can sort by Duration by clicking on the table column header.)

Here is a table of the top 20 compile time offenders as recorded on my local machine. Note that, as in the CI build output, 6 of the top 20 are just building the `ITERATOR_TEST`.

| rank | time (ms) | file |
| ---:| ---:|:--- |
|  1 | 814334 | /cudf.dir/src/search/search.cu.o
|  2 | 755375 | /cudf.dir/src/sort/sort_column.cu.o
|  3 | 686235 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_numeric.cu.o
|  4 | 670587 | /cudf.dir/src/groupby/sort/group_nunique.cu.o
|  5 | 585524 | /cudf.dir/src/reductions/scan/scan_inclusive.cu.o
|  6 | 582677 | /ITERATOR_TEST.dir/iterator/pair_iterator_test_numeric.cu.o
|  7 | 568418 | /ITERATOR_TEST.dir/iterator/scalar_iterator_test.cu.o
|  8 | 563196 | /cudf.dir/src/sort/sort.cu.o
|  9 | 548816 | /ITERATOR_TEST.dir/iterator/value_iterator_test_numeric.cu.o
| 10 | 535315 | /cudf.dir/src/groupby/sort/sort_helper.cu.o
| 11 | 531384 | /cudf.dir/src/sort/is_sorted.cu.o
| 12 | 530382 | /ITERATOR_TEST.dir/iterator/value_iterator_test_chrono.cu.o
| 13 | 525187 | /cudf.dir/src/join/semi_join.cu.o
| 14 | 523726 | /cudf.dir/src/rolling/rolling.cu.o
| 15 | 517909 | /cudf.dir/src/reductions/product.cu.o
| 16 | 513119 | /cudf.dir/src/stream_compaction/distinct_count.cu.o
| 17 | 512569 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_chrono.cu.o
| 18 | 508978 | /cudf.dir/src/reductions/sum_of_squares.cu.o
| 19 | 508460 | /cudf.dir/src/lists/drop_list_duplicates.cu.o
| 20 | 505247 | /cudf.dir/src/reductions/sum.cu.o

I made some simple changes to the iterator code logic to use different thrust functions along with a temporary device vector. This approach improved the compile time of the `ITERATOR_TEST` by about 3x. Here are the results of compiling the above 6 files with the changes in this PR.

| new rank | new time (ms) | file |
| ---:| ---:|:--- |
| 59 | 232691 (2.9x) | optional_iterator_test_numeric.cu.o |
| 26 | 416951 (1.4x) | pair_iterator_test_numeric.cu.o |
| 92 | 165947 (3.4x) | scalar_iterator_test.cu.o |
| 65 | 216364 (2.5x) | value_iterator_test_numeric.cu.o |
| 77 | 186583 (2.8x) | value_iterator_test_chrono.cu.o |
| 111 | 137789 (3.7x) | optional_iterator_test_chrono.cu.o |

Total overall build time improved locally by ~3m (10%) using `ninja -j48 install` on a Dell 5820.

Here are the build time results of a CI build with these changes.
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5190/testReport/(root)/BuildTime/

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/9788
---
 cpp/tests/iterator/iterator_tests.cuh         | 17 +++++++--
 .../optional_iterator_test_numeric.cu         | 37 +++++++++----------
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh
index 4ec347c4bc1..07eb595449c 100644
--- a/cpp/tests/iterator/iterator_tests.cuh
+++ b/cpp/tests/iterator/iterator_tests.cuh
@@ -18,8 +18,8 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/iterator.cuh>                             // include iterator header
-#include <cudf/detail/utilities/transform_unary_functions.cuh>  //for meanvar
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/transform_unary_functions.cuh>  // for meanvar
 #include <cudf/detail/utilities/vector_factories.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -28,6 +28,7 @@
 
 #include <thrust/equal.h>
 #include <thrust/functional.h>
+#include <thrust/logical.h>
 #include <thrust/transform.h>
 
 #include <cub/device/device_reduce.cuh>
@@ -83,7 +84,17 @@ struct IteratorTest : public cudf::test::BaseFixture {
     EXPECT_EQ(thrust::distance(d_in, d_in_last), num_items);
     auto dev_expected = cudf::detail::make_device_uvector_sync(expected);
 
-    bool result = thrust::equal(thrust::device, d_in, d_in_last, dev_expected.begin());
+    // using a temporary vector and calling transform and all_of separately is
+    // equivalent to thrust::equal but compiles ~3x faster
+    auto dev_results = rmm::device_uvector<bool>(num_items, rmm::cuda_stream_default);
+    thrust::transform(thrust::device,
+                      d_in,
+                      d_in_last,
+                      dev_expected.begin(),
+                      dev_results.begin(),
+                      thrust::equal_to{});
+    auto result = thrust::all_of(
+      thrust::device, dev_results.begin(), dev_results.end(), thrust::identity<bool>{});
     EXPECT_TRUE(result) << "thrust test";
   }
 
diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu
index 6d51f4a5c14..a8c135a726f 100644
--- a/cpp/tests/iterator/optional_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu
@@ -50,21 +50,15 @@ struct transformer_optional_meanvar {
   }
 };
 
-struct sum_if_not_null {
-  template <typename T>
-  CUDA_HOST_DEVICE_CALLABLE thrust::optional<T> operator()(const thrust::optional<T>& lhs,
-                                                           const thrust::optional<T>& rhs)
-  {
-    return lhs.value_or(T{0}) + rhs.value_or(T{0});
-  }
+template <typename T>
+struct optional_to_meanvar {
+  CUDA_HOST_DEVICE_CALLABLE T operator()(const thrust::optional<T>& v) { return v.value_or(T{0}); }
 };
 
 // TODO: enable this test also at __CUDACC_DEBUG__
 // This test causes fatal compilation error only at device debug mode.
 // Workaround: exclude this test only at device debug mode.
 #if !defined(__CUDACC_DEBUG__)
-// This test computes `count`, `sum`, `sum_of_squares` at a single reduction call.
-// It would be useful for `var`, `std` operation
 TYPED_TEST(NumericOptionalIteratorTest, mean_var_output)
 {
   using T        = TypeParam;
@@ -104,22 +98,27 @@ TYPED_TEST(NumericOptionalIteratorTest, mean_var_output)
   expected_value.value_squared = std::accumulate(
     replaced_array.begin(), replaced_array.end(), T{0}, [](T acc, T i) { return acc + i * i; });
 
-  // std::cout << "expected <mixed_output> = " << expected_value << std::endl;
-
   // GPU test
   auto it_dev         = d_col->optional_begin<T>(cudf::contains_nulls::YES{});
   auto it_dev_squared = thrust::make_transform_iterator(it_dev, transformer);
-  auto result         = thrust::reduce(it_dev_squared,
-                               it_dev_squared + d_col->size(),
-                               thrust::optional<T_output>{T_output{}},
-                               sum_if_not_null{});
+
+  // this can be computed with a single reduce and without a temporary output vector
+  // but the approach increases the compile time by ~2x
+  auto results = rmm::device_uvector<T_output>(d_col->size(), rmm::cuda_stream_default);
+  thrust::transform(thrust::device,
+                    it_dev_squared,
+                    it_dev_squared + d_col->size(),
+                    results.begin(),
+                    optional_to_meanvar<T_output>{});
+  auto result = thrust::reduce(thrust::device, results.begin(), results.end(), T_output{});
+
   if (not std::is_floating_point<T>()) {
-    EXPECT_EQ(expected_value, *result) << "optional iterator reduction sum";
+    EXPECT_EQ(expected_value, result) << "optional iterator reduction sum";
   } else {
-    EXPECT_NEAR(expected_value.value, result->value, 1e-3) << "optional iterator reduction sum";
-    EXPECT_NEAR(expected_value.value_squared, result->value_squared, 1e-3)
+    EXPECT_NEAR(expected_value.value, result.value, 1e-3) << "optional iterator reduction sum";
+    EXPECT_NEAR(expected_value.value_squared, result.value_squared, 1e-3)
       << "optional iterator reduction sum squared";
-    EXPECT_EQ(expected_value.count, result->count) << "optional iterator reduction count";
+    EXPECT_EQ(expected_value.count, result.count) << "optional iterator reduction count";
   }
 }
 #endif

From 11c3dfef2e7fe6fd67ff93bdf36a47c0a5b2eb37 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 1 Dec 2021 10:28:24 -0600
Subject: [PATCH 28/42] Remove unused masked udf cython/c++ code (#9792)

This PR removes the C++ side of the original masked UDF code introduced in https://github.com/rapidsai/cudf/pull/8213. These kernels had some limitations and are now superseded by the numba-generated versions we moved to in https://github.com/rapidsai/cudf/pull/9174. As far as I can tell, cuDF Python was the only thing consuming this API for the short time it has existed. However, I am marking this as breaking just in case.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9792
---
 .../Modules/JitifyPreprocessKernels.cmake     |   4 +-
 cpp/include/cudf/transform.hpp                |   6 --
 cpp/src/transform/jit/masked_udf_kernel.cu    |  85 ---------------
 cpp/src/transform/transform.cpp               | 102 ------------------
 python/cudf/cudf/_lib/cpp/transform.pxd       |   6 --
 python/cudf/cudf/_lib/transform.pyx           |  24 -----
 6 files changed, 2 insertions(+), 225 deletions(-)
 delete mode 100644 cpp/src/transform/jit/masked_udf_kernel.cu

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index c2ad25760b8..6ab1293ab6f 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -51,8 +51,8 @@ function(jit_preprocess_files)
 endfunction()
 
 jit_preprocess_files(
-  SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu
-  transform/jit/masked_udf_kernel.cu transform/jit/kernel.cu rolling/jit/kernel.cu
+  SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu transform/jit/kernel.cu
+  rolling/jit/kernel.cu
 )
 
 add_custom_target(
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 55e7bc84dbe..45e8ff1310c 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -54,12 +54,6 @@ std::unique_ptr<column> transform(
   bool is_ptx,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-std::unique_ptr<column> generalized_masked_op(
-  table_view const& data_view,
-  std::string const& binary_udf,
-  data_type output_type,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 /**
  * @brief Creates a null_mask from `input` by converting `NaN` to null and
  * preserving existing null values and also returns new null_count.
diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
deleted file mode 100644
index 319ad730c53..00000000000
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstddef>
-#include <cstdint>
-#include <transform/jit/operation-udf.hpp>
-
-#include <cudf/types.hpp>
-#include <cudf/utilities/bit.hpp>
-
-#include <cuda/std/climits>
-#include <cuda/std/cstddef>
-#include <cuda/std/limits>
-#include <cuda/std/tuple>
-#include <cuda/std/type_traits>
-
-namespace cudf {
-namespace transformation {
-namespace jit {
-
-template <typename T>
-struct Masked {
-  T value;
-  bool valid;
-};
-
-template <typename TypeIn, typename MaskType, typename OffsetType>
-__device__ auto make_args(cudf::size_type id, TypeIn in_ptr, MaskType in_mask, OffsetType in_offset)
-{
-  bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
-  return cuda::std::make_tuple(in_ptr[id], valid);
-}
-
-template <typename InType, typename MaskType, typename OffsetType, typename... Arguments>
-__device__ auto make_args(cudf::size_type id,
-                          InType in_ptr,
-                          MaskType in_mask,      // in practice, always cudf::bitmask_type const*
-                          OffsetType in_offset,  // in practice, always cudf::size_type
-                          Arguments... args)
-{
-  bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
-  return cuda::std::tuple_cat(cuda::std::make_tuple(in_ptr[id], valid), make_args(id, args...));
-}
-
-template <typename TypeOut, typename... Arguments>
-__global__ void generic_udf_kernel(cudf::size_type size,
-                                   TypeOut* out_data,
-                                   bool* out_mask,
-                                   Arguments... args)
-{
-  int const tid    = threadIdx.x;
-  int const blkid  = blockIdx.x;
-  int const blksz  = blockDim.x;
-  int const gridsz = gridDim.x;
-  int const start  = tid + blkid * blksz;
-  int const step   = blksz * gridsz;
-
-  Masked<TypeOut> output;
-  for (cudf::size_type i = start; i < size; i += step) {
-    auto func_args = cuda::std::tuple_cat(
-      cuda::std::make_tuple(&output.value),
-      make_args(i, args...)  // passed int64*, bool*, int64, int64*, bool*, int64
-    );
-    cuda::std::apply(GENERIC_OP, func_args);
-    out_data[i] = output.value;
-    out_mask[i] = output.valid;
-  }
-}
-
-}  // namespace jit
-}  // namespace transformation
-}  // namespace cudf
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 5230b853a79..0cca6699586 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -19,12 +19,10 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
-#include <jit_preprocessed_files/transform/jit/masked_udf_kernel.cu.jit.hpp>
 
 #include <jit/cache.hpp>
 #include <jit/parser.hpp>
@@ -65,80 +63,6 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
-std::vector<std::string> make_template_types(column_view outcol_view, table_view const& data_view)
-{
-  std::string mskptr_type =
-    cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::bitmask_type>())) + "*";
-  std::string offset_type =
-    cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));
-
-  std::vector<std::string> template_types;
-  template_types.reserve((3 * data_view.num_columns()) + 1);
-
-  template_types.push_back(cudf::jit::get_type_name(outcol_view.type()));
-  for (auto const& col : data_view) {
-    template_types.push_back(cudf::jit::get_type_name(col.type()) + "*");
-    template_types.push_back(mskptr_type);
-    template_types.push_back(offset_type);
-  }
-  return template_types;
-}
-
-void generalized_operation(table_view const& data_view,
-                           std::string const& udf,
-                           data_type output_type,
-                           mutable_column_view outcol_view,
-                           mutable_column_view outmsk_view,
-                           rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
-{
-  auto const template_types = make_template_types(outcol_view, data_view);
-
-  std::string generic_kernel_name =
-    jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
-      .instantiate(template_types);
-
-  std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
-    udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
-
-  std::vector<void*> kernel_args;
-  kernel_args.reserve((data_view.num_columns() * 3) + 3);
-
-  cudf::size_type size   = outcol_view.size();
-  const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view);
-  const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view);
-  kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr});
-
-  std::vector<const void*> data_ptrs;
-  std::vector<cudf::bitmask_type const*> mask_ptrs;
-  std::vector<cudf::offset_type> offsets;
-
-  data_ptrs.reserve(data_view.num_columns());
-  mask_ptrs.reserve(data_view.num_columns());
-  offsets.reserve(data_view.num_columns());
-
-  auto const iters = thrust::make_zip_iterator(
-    thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin()));
-
-  std::for_each(iters, iters + data_view.num_columns(), [&](auto const& tuple_vals) {
-    kernel_args.push_back(&thrust::get<0>(tuple_vals));
-    kernel_args.push_back(&thrust::get<1>(tuple_vals));
-    kernel_args.push_back(&thrust::get<2>(tuple_vals));
-  });
-
-  std::transform(data_view.begin(), data_view.end(), iters, [&](column_view const& col) {
-    return thrust::make_tuple(cudf::jit::get_data_ptr(col), col.null_mask(), col.offset());
-  });
-
-  cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
-    .get_kernel(generic_kernel_name,
-                {},
-                {{"transform/jit/operation-udf.hpp", generic_cuda_source}},
-                {"-arch=sm_."})
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())
-    ->launch(kernel_args.data());
-}
-
 }  // namespace jit
 }  // namespace transformation
 
@@ -165,24 +89,6 @@ std::unique_ptr<column> transform(column_view const& input,
   return output;
 }
 
-std::unique_ptr<column> generalized_masked_op(table_view const& data_view,
-                                              std::string const& udf,
-                                              data_type output_type,
-                                              rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
-{
-  std::unique_ptr<column> output = make_fixed_width_column(output_type, data_view.num_rows());
-  std::unique_ptr<column> output_mask =
-    make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows());
-
-  transformation::jit::generalized_operation(
-    data_view, udf, output_type, *output, *output_mask, stream, mr);
-
-  auto final_output_mask = cudf::bools_to_mask(*output_mask);
-  output.get()->set_null_mask(std::move(*(final_output_mask.first)));
-  return output;
-}
-
 }  // namespace detail
 
 std::unique_ptr<column> transform(column_view const& input,
@@ -195,12 +101,4 @@ std::unique_ptr<column> transform(column_view const& input,
   return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<column> generalized_masked_op(table_view const& data_view,
-                                              std::string const& udf,
-                                              data_type output_type,
-                                              rmm::mr::device_memory_resource* mr)
-{
-  return detail::generalized_masked_op(data_view, udf, output_type, rmm::cuda_stream_default, mr);
-}
-
 }  // namespace cudf
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index 3153427ce3c..590a371ff52 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -34,12 +34,6 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         bool is_ptx
     ) except +
 
-    cdef unique_ptr[column] generalized_masked_op(
-        const table_view& data_view,
-        string udf,
-        data_type output_type,
-    ) except +
-
     cdef pair[unique_ptr[table], unique_ptr[column]] encode(
         table_view input
     ) except +
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index a0eb7c68183..96d25cb92c9 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -123,30 +123,6 @@ def transform(Column input, op):
     return Column.from_unique_ptr(move(c_output))
 
 
-def masked_udf(incols, op, output_type):
-    cdef table_view data_view = table_view_from_table(
-        incols, ignore_index=True)
-    cdef string c_str = op.encode("UTF-8")
-    cdef type_id c_tid
-    cdef data_type c_dtype
-
-    c_tid = <type_id> (
-        <underlying_type_t_type_id> SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[
-            output_type
-        ]
-    )
-    c_dtype = data_type(c_tid)
-
-    with nogil:
-        c_output = move(libcudf_transform.generalized_masked_op(
-            data_view,
-            c_str,
-            c_dtype,
-        ))
-
-    return Column.from_unique_ptr(move(c_output))
-
-
 def table_encode(input):
     cdef table_view c_input = table_view_from_table(
         input, ignore_index=True)

From 1904d1a9ff54343471998523816c9e0a00f46797 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Wed, 1 Dec 2021 13:00:16 -0600
Subject: [PATCH 29/42] Fix overflow for min calculation in
 strings::from_timestamps (#9793)

This fixes #9790

When converting a timestamp to a string, the intermediate result of the `%M` (minute) calculation can overflow an `int32_t` because `scale_time` casts its 64-bit result down to `int32_t`. This changes `scale_time` to return an `int64_t`, which avoids the overflow.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/9793
---
 cpp/src/strings/convert/convert_datetime.cu | 4 ++--
 cpp/tests/strings/datetime_tests.cpp        | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 51a6a796ba3..8d0c5704a7b 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -707,9 +707,9 @@ struct from_timestamp_base {
    *     scale( 61,60) ->  1
    * @endcode
    */
-  __device__ int32_t scale_time(int64_t time, int64_t base) const
+  __device__ int64_t scale_time(int64_t time, int64_t base) const
   {
-    return static_cast<int32_t>((time - ((time < 0) * (base - 1L))) / base);
+    return (time - ((time < 0) * (base - 1L))) / base;
   };
 
   __device__ time_components get_time_components(int64_t tstamp) const
diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp
index 4543607614f..9a01d5dd041 100644
--- a/cpp/tests/strings/datetime_tests.cpp
+++ b/cpp/tests/strings/datetime_tests.cpp
@@ -311,13 +311,14 @@ TEST_F(StringsDatetimeTest, FromTimestampAmPm)
 TEST_F(StringsDatetimeTest, FromTimestampMillisecond)
 {
   cudf::test::fixed_width_column_wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep> timestamps_ms{
-    1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047};
+    1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047, 128849018880000};
   auto results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%3f");
   cudf::test::strings_column_wrapper expected_ms{"2018-07-04 12:00:00.123",
                                                  "2020-02-29 00:01:01.007",
                                                  "2015-12-29 23:02:02.421",
                                                  "2011-10-11 03:03:03.999",
-                                                 "1776-07-04 11:59:59.953"};
+                                                 "1776-07-04 11:59:59.953",
+                                                 "6053-01-23 02:08:00.000"};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms);
 
   results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%f");
@@ -325,7 +326,8 @@ TEST_F(StringsDatetimeTest, FromTimestampMillisecond)
                                                     "2020-02-29 00:01:01.007000",
                                                     "2015-12-29 23:02:02.421000",
                                                     "2011-10-11 03:03:03.999000",
-                                                    "1776-07-04 11:59:59.953000"};
+                                                    "1776-07-04 11:59:59.953000",
+                                                    "6053-01-23 02:08:00.000000"};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms_6f);
 
   cudf::test::fixed_width_column_wrapper<cudf::timestamp_ns, cudf::timestamp_ns::rep> timestamps_ns{

From 836f800e61acafa0fa6b3c7d9826904f0ba2ad06 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Wed, 1 Dec 2021 16:46:14 -0500
Subject: [PATCH 30/42] Use CTAD with Thrust function objects (#9768)

While reviewing another PR, I noticed unnecessary explicit template parameters on Thrust function objects (e.g. `thrust::plus<size_t>{}`), so I opened this small PR to remove them by relying on class template argument deduction (CTAD), which is available since C++17.

CI depends on https://github.com/rapidsai/cudf/pull/9766

Authors:
  - Conor Hoekstra (https://github.com/codereport)

Approvers:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/9768
---
 cpp/include/cudf/strings/detail/gather.cuh       |  2 +-
 cpp/include/cudf_test/column_wrapper.hpp         |  7 ++-----
 cpp/src/copying/concatenate.cu                   |  2 +-
 cpp/src/groupby/sort/group_merge_m2.cu           |  4 ++--
 cpp/src/groupby/sort/group_rank_scan.cu          |  2 +-
 cpp/src/groupby/sort/group_scan_util.cuh         |  6 +++---
 .../sort/group_single_pass_reduction_util.cuh    | 16 ++++++++--------
 cpp/src/groupby/sort/group_tdigest.cu            | 10 +++++-----
 cpp/src/join/hash_join.cu                        |  2 +-
 cpp/src/join/join_utils.cu                       |  2 +-
 .../lists/combine/concatenate_list_elements.cu   |  2 +-
 cpp/src/lists/contains.cu                        |  7 ++-----
 cpp/src/lists/interleave_columns.cu              |  8 ++++----
 cpp/src/quantiles/tdigest/tdigest.cu             |  7 ++-----
 cpp/src/reductions/scan/scan_inclusive.cu        |  9 ++++-----
 cpp/src/rolling/grouped_rolling.cu               |  6 +++---
 cpp/src/rolling/rolling_collect_list.cu          |  2 +-
 cpp/src/sort/rank.cu                             | 10 +++++-----
 cpp/src/strings/copying/concatenate.cu           |  2 +-
 cpp/src/strings/findall.cu                       |  7 ++-----
 cpp/src/strings/repeat_strings.cu                |  2 +-
 cpp/src/strings/split/split.cu                   | 14 ++++----------
 cpp/tests/iterator/iterator_tests.cuh            | 11 +++--------
 .../apply_boolean_mask_tests.cpp                 |  4 ++--
 cpp/tests/strings/fixed_point_tests.cpp          |  2 +-
 cpp/tests/transform/row_bit_count_test.cu        |  6 ++----
 26 files changed, 63 insertions(+), 89 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index ec4a88a0e46..eb7258830ce 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -315,7 +315,7 @@ std::unique_ptr<cudf::column> gather(
     d_out_offsets + output_count,
     [] __device__(auto size) { return static_cast<size_t>(size); },
     size_t{0},
-    thrust::plus<size_t>{});
+    thrust::plus{});
   CUDF_EXPECTS(total_bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
                "total size of output strings is too large for a cudf column");
 
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index cd2ac9f3ec1..ccfdde2270c 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -1502,11 +1502,8 @@ class lists_column_wrapper : public detail::column_wrapper {
 
     // concatenate them together, skipping children that are null.
     std::vector<column_view> children;
-    thrust::copy_if(std::cbegin(cols),
-                    std::cend(cols),
-                    valids,  // stencil
-                    std::back_inserter(children),
-                    thrust::identity<bool>{});
+    thrust::copy_if(
+      std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{});
 
     auto data = children.empty() ? cudf::empty_like(expected_hierarchy) : concatenate(children);
 
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index f4b6a8bf5fd..34c0cea683e 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -79,7 +79,7 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
     device_views.cend(),
     std::next(offsets.begin()),
     [](auto const& col) { return col.size(); },
-    thrust::plus<size_t>{});
+    thrust::plus{});
   auto d_offsets         = make_device_uvector_async(offsets, stream);
   auto const output_size = offsets.back();
 
diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu
index 4e2a5b68abc..bde7c985df1 100644
--- a/cpp/src/groupby/sort/group_merge_m2.cu
+++ b/cpp/src/groupby/sort/group_merge_m2.cu
@@ -173,8 +173,8 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
 
   // Generate bitmask for the output.
   // Only mean and M2 values can be nullable. Count column must be non-nullable.
-  auto [null_mask, null_count] = cudf::detail::valid_if(
-    validities.begin(), validities.end(), thrust::identity<int8_t>{}, stream, mr);
+  auto [null_mask, null_count] =
+    cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);
   if (null_count > 0) {
     result_means->set_null_mask(null_mask, null_count);           // copy null_mask
     result_M2s->set_null_mask(std::move(null_mask), null_count);  // take over null_mask
diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 935ef9554a9..f36bdc0a660 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -79,7 +79,7 @@ std::unique_ptr<column> rank_generator(column_view const& order_by,
                                 group_labels.end(),
                                 mutable_ranks.begin<size_type>(),
                                 mutable_ranks.begin<size_type>(),
-                                thrust::equal_to<size_type>{},
+                                thrust::equal_to{},
                                 scan_op);
 
   return ranks;
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index ae3e3232e06..e25fdd6fc27 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -115,7 +115,7 @@ struct group_scan_functor<K, T, std::enable_if_t<is_group_scan_supported<K, T>()
                                     group_labels.end(),
                                     inp_iter,
                                     out_iter,
-                                    thrust::equal_to<size_type>{},
+                                    thrust::equal_to{},
                                     binop);
     };
 
@@ -160,7 +160,7 @@ struct group_scan_functor<K,
                                     group_labels.end(),
                                     inp_iter,
                                     out_iter,
-                                    thrust::equal_to<size_type>{},
+                                    thrust::equal_to{},
                                     binop);
     };
 
@@ -214,7 +214,7 @@ struct group_scan_functor<K,
                                     group_labels.end(),
                                     inp_iter,
                                     out_iter,
-                                    thrust::equal_to<size_type>{},
+                                    thrust::equal_to{},
                                     binop);
     };
 
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index decb127b264..95a36f40e57 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -191,7 +191,7 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_support
                             inp_iter,
                             thrust::make_discard_iterator(),
                             out_iter,
-                            thrust::equal_to<size_type>{},
+                            thrust::equal_to{},
                             binop);
     };
 
@@ -215,10 +215,10 @@ struct group_reduction_functor<K, T, std::enable_if_t<is_group_reduction_support
       rmm::device_uvector<bool> validity(num_groups, stream);
       do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
                    validity.begin(),
-                   thrust::logical_or<bool>{});
+                   thrust::logical_or{});
 
-      auto [null_mask, null_count] = cudf::detail::valid_if(
-        validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
+      auto [null_mask, null_count] =
+        cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr);
       result->set_null_mask(std::move(null_mask), null_count);
     }
     return result;
@@ -264,7 +264,7 @@ struct group_reduction_functor<
                             inp_iter,
                             thrust::make_discard_iterator(),
                             out_iter,
-                            thrust::equal_to<size_type>{},
+                            thrust::equal_to{},
                             binop);
     };
 
@@ -283,10 +283,10 @@ struct group_reduction_functor<
       auto validity           = rmm::device_uvector<bool>(num_groups, stream);
       do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr),
                    validity.begin(),
-                   thrust::logical_or<bool>{});
+                   thrust::logical_or{});
 
-      auto [null_mask, null_count] = cudf::detail::valid_if(
-        validity.begin(), validity.end(), thrust::identity<bool>{}, stream, mr);
+      auto [null_mask, null_count] =
+        cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr);
       result->set_null_mask(std::move(null_mask), null_count);
     } else {
       auto const binop =
diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu
index 146a6a8c31c..551eb128231 100644
--- a/cpp/src/groupby/sort/group_tdigest.cu
+++ b/cpp/src/groupby/sort/group_tdigest.cu
@@ -625,7 +625,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                         centroids_begin,                  // values
                         thrust::make_discard_iterator(),  // key output
                         output,                           // output
-                        thrust::equal_to<size_type>{},    // key equality check
+                        thrust::equal_to{},               // key equality check
                         merge_centroids{});
 
   // create final tdigest column
@@ -850,8 +850,8 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
                         min_iter,
                         thrust::make_discard_iterator(),
                         merged_min_col->mutable_view().begin<double>(),
-                        thrust::equal_to<size_type>{},  // key equality check
-                        thrust::minimum<double>{});
+                        thrust::equal_to{},  // key equality check
+                        thrust::minimum{});
 
   auto merged_max_col = cudf::make_numeric_column(
     data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
@@ -864,8 +864,8 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
                         max_iter,
                         thrust::make_discard_iterator(),
                         merged_max_col->mutable_view().begin<double>(),
-                        thrust::equal_to<size_type>{},  // key equality check
-                        thrust::maximum<double>{});
+                        thrust::equal_to{},  // key equality check
+                        thrust::maximum{});
 
   // for any empty groups, set the min and max to be 0. not technically necessary but it makes
   // testing simpler.
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index e4bd1938ecc..c5b680f129e 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -266,7 +266,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table,
     left_join_complement_size = thrust::count_if(rmm::exec_policy(stream),
                                                  invalid_index_map->begin(),
                                                  invalid_index_map->end(),
-                                                 thrust::identity<size_type>());
+                                                 thrust::identity());
   }
   return join_size + left_join_complement_size;
 }
diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu
index 4aca4b4a9cf..9e98f87e7f0 100644
--- a/cpp/src/join/join_utils.cu
+++ b/cpp/src/join/join_utils.cu
@@ -136,7 +136,7 @@ get_left_join_indices_complement(std::unique_ptr<rmm::device_uvector<size_type>>
                                               thrust::make_counting_iterator(end_counter),
                                               invalid_index_map->begin(),
                                               right_indices_complement->begin(),
-                                              thrust::identity<size_type>()) -
+                                              thrust::identity{}) -
                               right_indices_complement->begin();
     right_indices_complement->resize(indices_count, stream);
   }
diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu
index 4bef312b396..2ddede97ce4 100644
--- a/cpp/src/lists/combine/concatenate_list_elements.cu
+++ b/cpp/src/lists/combine/concatenate_list_elements.cu
@@ -225,7 +225,7 @@ std::unique_ptr<column> concatenate_lists_nullifying_rows(column_view const& inp
   auto list_entries =
     gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr);
   auto [null_mask, null_count] = cudf::detail::valid_if(
-    list_validities.begin(), list_validities.end(), thrust::identity<int8_t>{}, stream, mr);
+    list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr);
 
   return make_lists_column(num_rows,
                            std::move(list_offsets),
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index bdbc9ae013c..b48982d205a 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -74,11 +74,8 @@ struct lookup_functor {
     if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) {
       return {rmm::device_buffer{0, stream, mr}, size_type{0}};
     } else {
-      return cudf::detail::valid_if(result_validity.begin<bool>(),
-                                    result_validity.end<bool>(),
-                                    thrust::identity<bool>{},
-                                    stream,
-                                    mr);
+      return cudf::detail::valid_if(
+        result_validity.begin<bool>(), result_validity.end<bool>(), thrust::identity{}, stream, mr);
     }
   }
 
diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu
index b9b73d98ed2..220cb25a942 100644
--- a/cpp/src/lists/interleave_columns.cu
+++ b/cpp/src/lists/interleave_columns.cu
@@ -228,8 +228,8 @@ struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::
     auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(
       comp_fn, num_output_lists, num_output_entries, stream, mr);
 
-    auto [null_mask, null_count] = cudf::detail::valid_if(
-      validities.begin(), validities.end(), thrust::identity<int8_t>{}, stream, mr);
+    auto [null_mask, null_count] =
+      cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);
 
     return make_strings_column(num_output_entries,
                                std::move(offsets_column),
@@ -306,7 +306,7 @@ struct interleave_list_entries_impl<T, std::enable_if_t<cudf::is_fixed_width<T>(
 
     if (data_has_null_mask) {
       auto [null_mask, null_count] = cudf::detail::valid_if(
-        validities.begin(), validities.end(), thrust::identity<int8_t>{}, stream, mr);
+        validities.begin(), validities.end(), thrust::identity{}, stream, mr);
       if (null_count > 0) { output->set_null_mask(null_mask, null_count); }
     }
 
@@ -405,7 +405,7 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
   }
 
   auto [null_mask, null_count] = cudf::detail::valid_if(
-    list_validities.begin(), list_validities.end(), thrust::identity<int8_t>{}, stream, mr);
+    list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr);
   return make_lists_column(num_output_lists,
                            std::move(list_offsets),
                            std::move(list_entries),
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 57c221b15ed..18e7d02d086 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -348,11 +348,8 @@ std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
     if (null_count == 0) {
       return std::pair<rmm::device_buffer, size_type>{rmm::device_buffer{}, null_count};
     }
-    return cudf::detail::valid_if(tdigest_is_empty,
-                                  tdigest_is_empty + tdv.size(),
-                                  thrust::logical_not<size_type>{},
-                                  stream,
-                                  mr);
+    return cudf::detail::valid_if(
+      tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr);
   }();
 
   return cudf::make_lists_column(
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 70f5ca90539..b0e761c4c3b 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -50,11 +50,10 @@ rmm::device_buffer mask_scan(column_view const& input_view,
   auto valid_itr = detail::make_validity_iterator(*d_input);
 
   auto first_null_position = [&] {
-    size_type const first_null = thrust::find_if_not(rmm::exec_policy(stream),
-                                                     valid_itr,
-                                                     valid_itr + input_view.size(),
-                                                     thrust::identity<bool>{}) -
-                                 valid_itr;
+    size_type const first_null =
+      thrust::find_if_not(
+        rmm::exec_policy(stream), valid_itr, valid_itr + input_view.size(), thrust::identity{}) -
+      valid_itr;
     size_type const exclusive_offset = (inclusive == scan_type::EXCLUSIVE) ? 1 : 0;
     return std::min(input_view.size(), first_null + exclusive_offset);
   }();
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 509f67bb5c6..5a7f15148d8 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -142,8 +142,8 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                preceding_window] __device__(size_type idx) {
     auto group_label = d_group_labels[idx];
     auto group_start = d_group_offsets[group_label];
-    return thrust::minimum<size_type>{}(preceding_window,
-                                        idx - group_start + 1);  // Preceding includes current row.
+    return thrust::minimum{}(preceding_window,
+                             idx - group_start + 1);  // Preceding includes current row.
   };
 
   auto following_calculator = [d_group_offsets = group_offsets.data(),
@@ -152,7 +152,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
     auto group_label = d_group_labels[idx];
     auto group_end   = d_group_offsets[group_label + 1];  // Cannot fall off the end, since offsets
                                                           // is capped with `input.size()`.
-    return thrust::minimum<size_type>{}(following_window, (group_end - 1) - idx);
+    return thrust::minimum{}(following_window, (group_end - 1) - idx);
   };
 
   if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) {
diff --git a/cpp/src/rolling/rolling_collect_list.cu b/cpp/src/rolling/rolling_collect_list.cu
index ecef90dc8e1..30c39bde7d2 100644
--- a/cpp/src/rolling/rolling_collect_list.cu
+++ b/cpp/src/rolling/rolling_collect_list.cu
@@ -75,7 +75,7 @@ std::unique_ptr<column> get_list_child_to_list_row_mapping(cudf::column_view con
                          per_row_mapping_begin,
                          per_row_mapping_begin + num_child_rows,
                          per_row_mapping_begin,
-                         thrust::maximum<size_type>{});
+                         thrust::maximum{});
   return per_row_mapping;
 }
 
diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index c8a908e44cd..e9589e6c4b3 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -117,7 +117,7 @@ void tie_break_ranks_transform(cudf::device_span<size_type const> dense_rank_sor
                         tie_iter,
                         thrust::make_discard_iterator(),
                         tie_sorted.begin(),
-                        thrust::equal_to<size_type>{},
+                        thrust::equal_to{},
                         tie_breaker);
   auto sorted_tied_rank = thrust::make_transform_iterator(
     dense_rank_sorted.begin(),
@@ -171,8 +171,8 @@ void rank_min(cudf::device_span<size_type const> group_keys,
                                        thrust::make_counting_iterator<size_type>(1),
                                        sorted_order_view,
                                        rank_mutable_view.begin<outputType>(),
-                                       thrust::minimum<size_type>{},
-                                       thrust::identity<outputType>{},
+                                       thrust::minimum{},
+                                       thrust::identity{},
                                        stream);
 }
 
@@ -189,8 +189,8 @@ void rank_max(cudf::device_span<size_type const> group_keys,
                                        thrust::make_counting_iterator<size_type>(1),
                                        sorted_order_view,
                                        rank_mutable_view.begin<outputType>(),
-                                       thrust::maximum<size_type>{},
-                                       thrust::identity<outputType>{},
+                                       thrust::maximum{},
+                                       thrust::identity{},
                                        stream);
 }
 
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index db8b37a9592..3822fa8bf5a 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -96,7 +96,7 @@ auto create_strings_device_views(host_span<column_view const> views, rmm::cuda_s
                                    device_views_ptr + views.size(),
                                    std::next(d_partition_offsets.begin()),
                                    chars_size_transform{},
-                                   thrust::plus<size_t>{});
+                                   thrust::plus{});
   auto const output_chars_size = d_partition_offsets.back_element(stream);
   stream.synchronize();  // ensure copy of output_chars_size is complete before returning
 
diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu
index 3ab5b55020c..8d96f0de415 100644
--- a/cpp/src/strings/findall.cu
+++ b/cpp/src/strings/findall.cu
@@ -153,11 +153,8 @@ std::unique_ptr<table> findall_re(
 
   std::vector<std::unique_ptr<column>> results;
 
-  size_type const columns = thrust::reduce(rmm::exec_policy(stream),
-                                           find_counts.begin(),
-                                           find_counts.end(),
-                                           0,
-                                           thrust::maximum<size_type>{});
+  size_type const columns = thrust::reduce(
+    rmm::exec_policy(stream), find_counts.begin(), find_counts.end(), 0, thrust::maximum{});
   // boundary case: if no columns, return all nulls column (issue #119)
   if (columns == 0)
     results.emplace_back(std::make_unique<column>(
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index 458f3ed885c..7820e0064a6 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -369,7 +369,7 @@ std::pair<std::unique_ptr<column>, int64_t> repeat_strings_output_sizes(
                              thrust::make_counting_iterator<size_type>(strings_count),
                              fn,
                              int64_t{0},
-                             thrust::plus<int64_t>{});
+                             thrust::plus{});
 
   return std::make_pair(std::move(output_sizes), total_bytes);
 }
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index 5113b418501..c6e52a79059 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -490,11 +490,8 @@ std::unique_ptr<table> split_fn(strings_column_view const& strings_column,
     });
 
   // the columns_count is the maximum number of tokens for any string
-  auto const columns_count = thrust::reduce(rmm::exec_policy(stream),
-                                            token_counts.begin(),
-                                            token_counts.end(),
-                                            0,
-                                            thrust::maximum<size_type>{});
+  auto const columns_count = thrust::reduce(
+    rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{});
   // boundary case: if no columns, return one null column (custrings issue #119)
   if (columns_count == 0) {
     results.push_back(std::make_unique<column>(
@@ -748,11 +745,8 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
                     [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); });
 
   // column count is the maximum number of tokens for any string
-  size_type const columns_count = thrust::reduce(rmm::exec_policy(stream),
-                                                 token_counts.begin(),
-                                                 token_counts.end(),
-                                                 0,
-                                                 thrust::maximum<size_type>{});
+  size_type const columns_count = thrust::reduce(
+    rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{});
 
   std::vector<std::unique_ptr<column>> results;
   // boundary case: if no columns, return one null column (issue #119)
diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh
index 07eb595449c..d93c1275122 100644
--- a/cpp/tests/iterator/iterator_tests.cuh
+++ b/cpp/tests/iterator/iterator_tests.cuh
@@ -51,13 +51,8 @@ struct IteratorTest : public cudf::test::BaseFixture {
 
     // Get temporary storage size
     size_t temp_storage_bytes = 0;
-    cub::DeviceReduce::Reduce(nullptr,
-                              temp_storage_bytes,
-                              d_in,
-                              dev_result.begin(),
-                              num_items,
-                              thrust::minimum<T_output>{},
-                              init);
+    cub::DeviceReduce::Reduce(
+      nullptr, temp_storage_bytes, d_in, dev_result.begin(), num_items, thrust::minimum{}, init);
 
     // Allocate temporary storage
     rmm::device_buffer d_temp_storage(temp_storage_bytes, rmm::cuda_stream_default);
@@ -68,7 +63,7 @@ struct IteratorTest : public cudf::test::BaseFixture {
                               d_in,
                               dev_result.begin(),
                               num_items,
-                              thrust::minimum<T_output>{},
+                              thrust::minimum{},
                               init);
 
     evaluate(expected, dev_result, "cub test");
diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
index 813cceb0861..c80a8fba55c 100644
--- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
+++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
@@ -204,13 +204,13 @@ TEST_F(ApplyBooleanMask, FixedPointLargeColumnTest)
                   dec32_data.cend(),
                   mask_data.cbegin(),
                   std::back_inserter(expect_dec32_data),
-                  thrust::identity<bool>());
+                  thrust::identity{});
   thrust::copy_if(thrust::seq,
                   dec64_data.cbegin(),
                   dec64_data.cend(),
                   mask_data.cbegin(),
                   std::back_inserter(expect_dec64_data),
-                  thrust::identity<bool>());
+                  thrust::identity{});
 
   decimal32_wrapper expect_col32(
     expect_dec32_data.begin(), expect_dec32_data.end(), numeric::scale_type{-3});
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index ce4280e0733..5872a9e5bb7 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -329,4 +329,4 @@ TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator)
 
   auto const c = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-38}}};
   EXPECT_EQ(static_cast<std::string>(c), "1.70141183460469231731687303715884105727");
-}
\ No newline at end of file
+}
diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu
index 7fb7326f221..43d63c9fd22 100644
--- a/cpp/tests/transform/row_bit_count_test.cu
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -239,10 +239,8 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock)
   // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows};
   auto ints      = make_numeric_column(data_type{type_id::INT32}, num_rows * 2);
   auto ints_view = ints->mutable_view();
-  thrust::tabulate(thrust::device,
-                   ints_view.begin<int32_t>(),
-                   ints_view.end<int32_t>(),
-                   thrust::identity<int32_t>());
+  thrust::tabulate(
+    thrust::device, ints_view.begin<int32_t>(), ints_view.end<int32_t>(), thrust::identity{});
 
   // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2};
   auto list_offsets      = make_numeric_column(data_type{type_id::INT32}, num_rows + 1);

From 677e63236a81ea3c402df993845a1fdc98072c9e Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Wed, 1 Dec 2021 16:46:25 -0500
Subject: [PATCH 31/42] Avoid overflow for `fixed_point` `cudf::cast` and
 performance optimization (#9772)

This resolves https://github.com/rapidsai/cudf/issues/9000.

When using `cudf::cast` to convert a wider decimal type to a narrower decimal type, the cast can overflow. This PR modifies the code path for this specific use case so that the "rescale" happens before the type cast. A small performance improvement was also added: when the source and target scales are identical, the rescale is skipped entirely.

CI depends on https://github.com/rapidsai/cudf/pull/9766

Authors:
  - Conor Hoekstra (https://github.com/codereport)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/9772
---
 cpp/src/unary/cast_ops.cu      | 49 +++++++++++++++++++++-------------
 cpp/tests/unary/cast_tests.cpp | 13 +++++++++
 2 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index e852b00796a..131fde11cf8 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -305,28 +305,39 @@ struct dispatch_unary_cast_to {
                                      rmm::mr::device_memory_resource* mr)
   {
     using namespace numeric;
-
-    auto const size = input.size();
-    auto temporary =
-      std::make_unique<column>(cudf::data_type{type.id(), input.type().scale()},
-                               size,
-                               rmm::device_buffer{size * cudf::size_of(type), stream},
-                               copy_bitmask(input, stream),
-                               input.null_count());
-
     using SourceDeviceT = device_storage_type_t<SourceT>;
     using TargetDeviceT = device_storage_type_t<TargetT>;
 
-    mutable_column_view output_mutable = *temporary;
-
-    thrust::transform(rmm::exec_policy(stream),
-                      input.begin<SourceDeviceT>(),
-                      input.end<SourceDeviceT>(),
-                      output_mutable.begin<TargetDeviceT>(),
-                      device_cast<SourceDeviceT, TargetDeviceT>{});
-
-    // clearly there is a more efficient way to do this, can optimize in the future
-    return rescale<TargetT>(*temporary, numeric::scale_type{type.scale()}, stream, mr);
+    auto casted = [&]() {
+      auto const size = input.size();
+      auto output     = std::make_unique<column>(cudf::data_type{type.id(), input.type().scale()},
+                                             size,
+                                             rmm::device_buffer{size * cudf::size_of(type), stream},
+                                             copy_bitmask(input, stream),
+                                             input.null_count());
+
+      mutable_column_view output_mutable = *output;
+
+      thrust::transform(rmm::exec_policy(stream),
+                        input.begin<SourceDeviceT>(),
+                        input.end<SourceDeviceT>(),
+                        output_mutable.begin<TargetDeviceT>(),
+                        device_cast<SourceDeviceT, TargetDeviceT>{});
+
+      return output;
+    };
+
+    if (input.type().scale() == type.scale()) return casted();
+
+    if constexpr (sizeof(SourceDeviceT) < sizeof(TargetDeviceT)) {
+      // device_cast BEFORE rescale when SourceDeviceT is < TargetDeviceT
+      auto temporary = casted();
+      return detail::rescale<TargetT>(*temporary, scale_type{type.scale()}, stream, mr);
+    } else {
+      // device_cast AFTER rescale when SourceDeviceT is > TargetDeviceT to avoid overflow
+      auto temporary = detail::rescale<SourceT>(input, scale_type{type.scale()}, stream, mr);
+      return detail::cast(*temporary, type, stream, mr);
+    }
   }
 
   template <typename TargetT,
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index 4d0009ab20a..db457623d8d 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -1003,3 +1003,16 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScaleAndNullMask)
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
+
+TEST_F(FixedPointTestSingleType, Int32ToInt64Convert)
+{
+  using namespace numeric;
+  using fp_wrapperA = cudf::test::fixed_point_column_wrapper<int32_t>;
+  using fp_wrapperB = cudf::test::fixed_point_column_wrapper<int64_t>;
+
+  auto const input    = fp_wrapperB{{141230900000L}, scale_type{-10}};
+  auto const expected = fp_wrapperA{{14123}, scale_type{-3}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimal32>(-3));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}

From 7d8a8e53f495279ae129fa46948c07230d6e77b4 Mon Sep 17 00:00:00 2001
From: Raza Jafri <razajafri@users.noreply.github.com>
Date: Wed, 1 Dec 2021 13:53:05 -0800
Subject: [PATCH 32/42] Allow cast decimal128 to string and add tests (#9756)

Small PR that enables casting Decimal128 columns to strings in the Java bindings (JNI) and adds a test.

Authors:
  - Raza Jafri (https://github.com/razajafri)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/9756
---
 java/src/main/native/src/ColumnViewJni.cpp       |  3 ++-
 .../java/ai/rapids/cudf/ColumnVectorTest.java    | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 4efac307627..02d5dc4569c 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -916,7 +916,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas
         case cudf::type_id::INT64:
         case cudf::type_id::UINT64: result = cudf::strings::from_integers(*column); break;
         case cudf::type_id::DECIMAL32:
-        case cudf::type_id::DECIMAL64: result = cudf::strings::from_fixed_point(*column); break;
+        case cudf::type_id::DECIMAL64:
+        case cudf::type_id::DECIMAL128: result = cudf::strings::from_fixed_point(*column); break;
         default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0);
       }
     } else if (column->type().id() == cudf::type_id::STRING) {
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index fa9052029cc..31a52eb2ec0 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3372,6 +3372,22 @@ void testFixedWidthCast() {
     }
   }
 
+  @Test
+  void testCastBigDecimalToString() {
+    BigDecimal[] bigValues = {new BigDecimal("923121331938210123.321"),
+        new BigDecimal("9223372036854775808.191"),
+        new BigDecimal("9328323982309091029831.002")
+    };
+
+    try (ColumnVector cv = ColumnVector.fromDecimals(bigValues);
+         ColumnVector values = cv.castTo(DType.STRING);
+         ColumnVector expected = ColumnVector.fromStrings("923121331938210123.321",
+             "9223372036854775808.191",
+             "9328323982309091029831.002")) {
+      assertColumnsAreEqual(expected, values);
+    }
+  }
+
   @Test
   void testCastStringToBigDecimal() {
     String[] bigValues = {"923121331938210123.321",

From 5491cc789bbfbaad7099124dcfe004719e7f013c Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 2 Dec 2021 03:30:50 +0530
Subject: [PATCH 33/42] Fix memory error due to lambda return type deduction
 limitation (#9778)

Fixes #9703
Replaces the device lambda with a device functor that has an explicit return type (required by [14. extended-lambda-restrictions](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions)).
~add `__host__` to lambda for nvcc return type deduction to work properly.~
~replaced `auto` (generic lambda) with `size_type`.~
This fixes the shared memory write error reported in #9703.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/9778
---
 cpp/src/sort/rank.cu         | 13 +++++++++----
 cpp/tests/sort/rank_test.cpp | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index e9589e6c4b3..de0a44e3234 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -194,6 +194,12 @@ void rank_max(cudf::device_span<size_type const> group_keys,
                                        stream);
 }
 
+// Returns index, count
+template <typename T>
+struct index_counter {
+  __device__ T operator()(size_type i) { return T{i, 1}; }
+};
+
 void rank_average(cudf::device_span<size_type const> group_keys,
                   column_view sorted_order_view,
                   mutable_column_view rank_mutable_view,
@@ -208,10 +214,9 @@ void rank_average(cudf::device_span<size_type const> group_keys,
   using MinCount = thrust::pair<size_type, size_type>;
   tie_break_ranks_transform<MinCount>(
     group_keys,
-    cudf::detail::make_counting_transform_iterator(1,
-                                                   [] __device__(auto i) {
-                                                     return MinCount{i, 1};
-                                                   }),
+    // Use device functor with return type. Cannot use device lambda due to limitation.
+    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions
+    cudf::detail::make_counting_transform_iterator(1, index_counter<MinCount>{}),
     sorted_order_view,
     rank_mutable_view.begin<double>(),
     [] __device__(auto rank_count1, auto rank_count2) {
diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp
index 94e389fc7ce..926ad1e203e 100644
--- a/cpp/tests/sort/rank_test.cpp
+++ b/cpp/tests/sort/rank_test.cpp
@@ -410,5 +410,19 @@ TYPED_TEST(Rank, min_desc_bottom_pct)
   this->run_all_tests(rank_method::MIN, desc_bottom, col1_rank, col2_rank, col3_rank, true);
 }
 
+struct RankLarge : public BaseFixture {
+};
+
+TEST_F(RankLarge, average_large)
+{
+  // testcase of https://github.com/rapidsai/cudf/issues/9703
+  auto iter = thrust::counting_iterator<int64_t>(0);
+  fixed_width_column_wrapper<int64_t> col1(iter, iter + 10558);
+  auto result =
+    cudf::rank(col1, rank_method::AVERAGE, {}, null_policy::EXCLUDE, null_order::AFTER, false);
+  fixed_width_column_wrapper<double, int> expected(iter + 1, iter + 10559);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
 }  // namespace test
 }  // namespace cudf

From c10966cc3847ca9837ddc7ce5df9c4d9b7c743d8 Mon Sep 17 00:00:00 2001
From: Alfred Xu <lovedreamf@gmail.com>
Date: Thu, 2 Dec 2021 18:48:03 +0800
Subject: [PATCH 34/42] Fix make_empty_scalar_like on list_type (#9759)

Fixes #9758

In `make_empty_scalar_like`, the list scalar was created from an empty copy of the list column itself, which is wrong. The correct way is to create it from an empty copy of the list column's child.

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)

URL: https://github.com/rapidsai/cudf/pull/9759
---
 cpp/src/scalar/scalar_factories.cpp      | 7 +++++--
 cpp/tests/reductions/reduction_tests.cpp | 8 ++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp
index d2876435780..c18b57d220f 100644
--- a/cpp/src/scalar/scalar_factories.cpp
+++ b/cpp/src/scalar/scalar_factories.cpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <cudf/detail/copy.hpp>
+#include <cudf/lists/lists_column_view.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf {
@@ -184,10 +185,12 @@ std::unique_ptr<scalar> make_empty_scalar_like(column_view const& column,
 {
   std::unique_ptr<scalar> result;
   switch (column.type().id()) {
-    case type_id::LIST:
-      result = make_list_scalar(empty_like(column)->view(), stream, mr);
+    case type_id::LIST: {
+      auto const empty_child = empty_like(lists_column_view(column).child());
+      result                 = make_list_scalar(empty_child->view(), stream, mr);
       result->set_valid_async(false, stream);
       break;
+    }
     case type_id::STRUCT:
       // The input column must have at least 1 row to extract a scalar (row) from it.
       result = detail::get_element(column, 0, stream, mr);
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index d8ee8f9d08d..e138cd6f68e 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1961,7 +1961,11 @@ struct ListReductionTest : public cudf::test::BaseFixture {
         cudf::reduce(input_data, agg, cudf::data_type(cudf::type_id::LIST));
       auto list_result = dynamic_cast<cudf::list_scalar*>(result.get());
       EXPECT_EQ(is_valid, list_result->is_valid());
-      if (is_valid) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); }
+      if (is_valid) {
+        CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view());
+      } else {
+        CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_value, list_result->view());
+      }
     };
 
     if (succeeded_condition) {
@@ -2047,7 +2051,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement)
 
   // test against empty input
   this->reduction_test(LCW{},
-                       ElementCol{{0}, {0}},  // expected_value,
+                       ElementCol{},  // expected_value,
                        true,
                        false,
                        cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE));

From 582cc6e466c7d941e1b34893fd56fbd42fe90d68 Mon Sep 17 00:00:00 2001
From: Chong Gao <gaochong.gc@qq.com>
Date: Thu, 2 Dec 2021 21:12:01 +0800
Subject: [PATCH 35/42] Add sample JNI API (#9728)

Add a JNI binding for `cudf::sample`, exposed in Java as `Table.sample`.

Signed-off-by: Chong Gao <res_life@163.com>

Authors:
  - Chong Gao (https://github.com/res-life)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/9728
---
 java/src/main/java/ai/rapids/cudf/Table.java  | 30 +++++++++++++++++++
 java/src/main/native/src/TableJni.cpp         | 15 ++++++++++
 .../test/java/ai/rapids/cudf/TableTest.java   | 21 +++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index b0791fb440f..b11808ed023 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -678,6 +678,8 @@ private static native ContiguousTable[] contiguousSplitGroups(long inputTable,
                                                                 boolean[] keysDescending,
                                                                 boolean[] keysNullSmallest);
 
+  private static native long[] sample(long tableHandle, long n, boolean replacement, long seed);
+
   /////////////////////////////////////////////////////////////////////////////
   // TABLE CREATION APIs
   /////////////////////////////////////////////////////////////////////////////
@@ -2801,6 +2803,34 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data
     return result;
   }
 
+
+  /**
+   * Gather `n` samples from table randomly
+   * Note: does not preserve the ordering
+   * Example:
+   * input: {col1: {1, 2, 3, 4, 5}, col2: {6, 7, 8, 9, 10}}
+   * n: 3
+   * replacement: false
+   *
+   * output:       {col1: {3, 1, 4}, col2: {8, 6, 9}}
+   *
+   * replacement: true
+   *
+   * output:       {col1: {3, 1, 1}, col2: {8, 6, 6}}
+   *
+   * throws "logic_error" if `n` > table rows and `replacement` == FALSE.
+   * throws "logic_error" if `n` < 0.
+   *
+   * @param n non-negative number of samples expected from table
+   * @param replacement Allow or disallow sampling of the same row more than once.
+   * @param seed Seed value to initiate random number generator.
+   *
+   * @return Table containing samples
+   */
+  public Table sample(long n, boolean replacement, long seed) {
+    return new Table(sample(nativeHandle, n, replacement, seed));
+  }
+
   /////////////////////////////////////////////////////////////////////////////
   // HELPER CLASSES
   /////////////////////////////////////////////////////////////////////////////
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index a78d40a58f7..f3377bb002d 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -20,6 +20,7 @@
 #include <arrow/ipc/api.h>
 #include <cudf/aggregation.hpp>
 #include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/filling.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/hashing.hpp>
@@ -3147,4 +3148,18 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclass, jlong j_input,
+                                                              jlong n, jboolean replacement,
+                                                              jlong seed) {
+  JNI_NULL_CHECK(env, j_input, "input table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *input = reinterpret_cast<cudf::table_view *>(j_input);
+    auto sample_with_replacement =
+        replacement ? cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE;
+    std::unique_ptr<cudf::table> result = cudf::sample(*input, n, sample_with_replacement, seed);
+    return cudf::jni::convert_table_for_return(env, result);
+  }
+  CATCH_STD(env, 0);
+}
 } // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index fa221e19387..0b2f56895e9 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -7584,4 +7584,25 @@ void testExplodeOuterPosition() {
       }
     }
   }
+
+  @Test
+  void testSample() {
+    try (Table t = new Table.TestBuilder().column("s1", "s2", "s3", "s4", "s5").build()) {
+      try (Table ret = t.sample(3, false, 0);
+           Table expected = new Table.TestBuilder().column("s3", "s4", "s5").build()) {
+        assertTablesAreEqual(expected, ret);
+      }
+
+      try (Table ret = t.sample(5, false, 0);
+           Table expected = new Table.TestBuilder().column("s3", "s4", "s5", "s2", "s1").build()) {
+        assertTablesAreEqual(expected, ret);
+      }
+
+      try (Table ret = t.sample(8, true, 0);
+           Table expected = new Table.TestBuilder()
+               .column("s1", "s1", "s4", "s5", "s5", "s1", "s3", "s2").build()) {
+        assertTablesAreEqual(expected, ret);
+      }
+    }
+  }
 }

From 1077daeaad8ff710de6f4fbb99f2e7371b4af8de Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 2 Dec 2021 15:51:04 -0600
Subject: [PATCH 36/42] Fix caching in `Series.applymap` (#9821)

The cache key we were generating for these functions didn't take into account the constants referenced by the bytecode (`co_consts`). Hence functions that differ only by a constant value somewhere in their logic produced the same key and caused false cache hits.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9821
---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 19 +++++++++++++++++++
 python/cudf/cudf/utils/cudautils.py           |  4 +++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index dc126546f15..c9c2c440632 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -593,3 +593,22 @@ def func(row, c, k):
         return y
 
     run_masked_udf_test(func, data, args=(1, 2), check_dtype=False)
+
+
+def test_masked_udf_caching():
+    # Make sure similar functions that differ
+    # by simple things like constants actually
+    # recompile
+
+    data = cudf.Series([1, 2, 3])
+    expect = data ** 2
+    got = data.applymap(lambda x: x ** 2)
+
+    assert_eq(expect, got, check_dtype=False)
+
+    # update the constant value being used and make sure
+    # it does not result in a cache hit
+
+    expect = data ** 3
+    got = data.applymap(lambda x: x ** 3)
+    assert_eq(expect, got, check_dtype=False)
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index 5fa091a0081..f0533dcaa72 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -216,12 +216,14 @@ def make_cache_key(udf, sig):
     recompiling the same function for the same set of types
     """
     codebytes = udf.__code__.co_code
+    constants = udf.__code__.co_consts
     if udf.__closure__ is not None:
         cvars = tuple([x.cell_contents for x in udf.__closure__])
         cvarbytes = dumps(cvars)
     else:
         cvarbytes = b""
-    return codebytes, cvarbytes, sig
+
+    return constants, codebytes, cvarbytes, sig
 
 
 def compile_udf(udf, type_signature):

From 50acf076d4a35bc57dc00a416f0d9507b1992c0f Mon Sep 17 00:00:00 2001
From: MithunR <mythrocks@gmail.com>
Date: Thu, 2 Dec 2021 14:07:31 -0800
Subject: [PATCH 37/42] Fix stream usage in `segmented_gather()` (#9679)

`detail::segmented_gather()` inadvertently uses `cuda_default_stream` in some parts of its implementation, while using the user-specified stream in others.

This applies to the calls to `copy_range_in_place()`, `allocate_like()`, and `make_lists_column()`. ~This might produce race conditions, which might explain NVIDIA/spark-rapids/issues/4060. It's a rare failure that's quite hard to reproduce.~ This might lead to over-synchronization, though bad output is unlikely.

The commit here should sort this out, by switching to the `detail` APIs corresponding to the calls above.

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/9679
---
 cpp/src/lists/copying/segmented_gather.cu | 21 ++++++++++++---------
 cpp/src/lists/extract.cu                  |  2 +-
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index 8cbcddc1c58..41187b96cdb 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <cudf/detail/copy_range.cuh>
 #include <cudf/detail/gather.cuh>
-#include <cudf/detail/gather.hpp>
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
@@ -88,14 +88,15 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& value_column,
   auto child       = std::move(child_table->release().front());
 
   // Create list offsets from gather_map.
-  auto output_offset = cudf::allocate_like(
-    gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, mr);
+  auto output_offset = cudf::detail::allocate_like(
+    gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, stream, mr);
   auto output_offset_view = output_offset->mutable_view();
-  cudf::copy_range_in_place(gather_map.offsets(),
-                            output_offset_view,
-                            gather_map.offset(),
-                            gather_map.offset() + output_offset_view.size(),
-                            0);
+  cudf::detail::copy_range_in_place(gather_map.offsets(),
+                                    output_offset_view,
+                                    gather_map.offset(),
+                                    gather_map.offset() + output_offset_view.size(),
+                                    0,
+                                    stream);
   // Assemble list column & return
   auto null_mask       = cudf::detail::copy_bitmask(value_column.parent(), stream, mr);
   size_type null_count = value_column.null_count();
@@ -103,7 +104,9 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& value_column,
                            std::move(output_offset),
                            std::move(child),
                            null_count,
-                           std::move(null_mask));
+                           std::move(null_mask),
+                           stream,
+                           mr);
 }
 
 }  // namespace detail
diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu
index 381864e1a68..7c6c612eb25 100644
--- a/cpp/src/lists/extract.cu
+++ b/cpp/src/lists/extract.cu
@@ -53,7 +53,7 @@ std::unique_ptr<cudf::column> make_index_child(column_view const& indices,
   // `segmented_gather()` on a null index should produce a null row.
   if (not indices.nullable()) { return std::make_unique<column>(indices, stream); }
 
-  auto const d_indices = column_device_view::create(indices);
+  auto const d_indices = column_device_view::create(indices, stream);
   // Replace null indices with MAX_SIZE_TYPE, so that gather() returns null for them.
   auto const null_replaced_iter_begin =
     cudf::detail::make_null_replacement_iterator(*d_indices, std::numeric_limits<size_type>::max());

From b848dd5c9cfef7e3523810d67296e037f31945c1 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 2 Dec 2021 14:40:57 -0800
Subject: [PATCH 38/42] Fix ORC writer crash with empty input columns (#9808)

Fixes https://github.com/rapidsai/cudf/issues/9783

Skip some parts of writing when the input table has zero rows.
Add is_empty to `hostdevice_2dvector`.
Add Python test with empty columns.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Devavret Makkar (https://github.com/devavret)
  - Conor Hoekstra (https://github.com/codereport)

URL: https://github.com/rapidsai/cudf/pull/9808
---
 cpp/src/io/orc/writer_impl.cu              | 338 +++++++++++----------
 cpp/src/io/utilities/hostdevice_vector.hpp |   1 +
 python/cudf/cudf/tests/test_orc.py         |  15 +
 3 files changed, 188 insertions(+), 166 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index e53fb3589bc..db02125ce77 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -579,12 +579,15 @@ orc_streams writer::impl::create_streams(host_span<orc_column_view> columns,
         }
 
         auto const direct_data_size =
-          std::accumulate(segmentation.stripes.front().cbegin(),
-                          segmentation.stripes.back().cend(),
-                          size_t{0},
-                          [&](auto data_size, auto rg_idx) {
-                            return data_size + column.host_dict_chunk(rg_idx)->string_char_count;
-                          });
+          segmentation.num_stripes() == 0
+            ? 0
+            : std::accumulate(segmentation.stripes.front().cbegin(),
+                              segmentation.stripes.back().cend(),
+                              size_t{0},
+                              [&](auto data_size, auto rg_idx) {
+                                return data_size +
+                                       column.host_dict_chunk(rg_idx)->string_char_count;
+                              });
         if (enable_dict) {
           uint32_t dict_bits = 0;
           for (dict_bits = 1; dict_bits < 32; dict_bits <<= 1) {
@@ -988,17 +991,19 @@ encoded_data encode_columns(orc_table_view const& orc_table,
   }
   chunk_streams.host_to_device(stream);
 
-  if (orc_table.num_string_columns() != 0) {
-    auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict();
-    gpu::EncodeStripeDictionaries(d_stripe_dict,
-                                  chunks,
-                                  orc_table.num_string_columns(),
-                                  segmentation.num_stripes(),
-                                  chunk_streams,
-                                  stream);
-  }
+  if (orc_table.num_rows() > 0) {
+    if (orc_table.num_string_columns() != 0) {
+      auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict();
+      gpu::EncodeStripeDictionaries(d_stripe_dict,
+                                    chunks,
+                                    orc_table.num_string_columns(),
+                                    segmentation.num_stripes(),
+                                    chunk_streams,
+                                    stream);
+    }
 
-  gpu::EncodeOrcColumnData(chunks, chunk_streams, stream);
+    gpu::EncodeOrcColumnData(chunks, chunk_streams, stream);
+  }
   dictionaries.data.clear();
   dictionaries.index.clear();
   stream.synchronize();
@@ -1803,7 +1808,7 @@ void writer::impl::write(table_view const& table)
   auto dictionaries = allocate_dictionaries(orc_table, rowgroup_bounds, stream);
   hostdevice_2dvector<gpu::DictionaryChunk> dict(
     rowgroup_bounds.size().first, orc_table.num_string_columns(), stream);
-  if (orc_table.num_string_columns() != 0) {
+  if (not dict.is_empty()) {
     init_dictionaries(orc_table,
                       rowgroup_bounds,
                       dictionaries.d_data_view,
@@ -1819,7 +1824,7 @@ void writer::impl::write(table_view const& table)
   // Build stripe-level dictionaries
   hostdevice_2dvector<gpu::StripeDictionary> stripe_dict(
     segmentation.num_stripes(), orc_table.num_string_columns(), stream);
-  if (orc_table.num_string_columns() != 0) {
+  if (not stripe_dict.is_empty()) {
     build_dictionaries(orc_table,
                        segmentation.stripes,
                        dict,
@@ -1842,165 +1847,166 @@ void writer::impl::write(table_view const& table)
     segmentation.num_stripes(), num_data_streams, stream);
   auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs);
 
-  // Gather column statistics
-  std::vector<ColStatsBlob> column_stats;
-  if (enable_statistics_ && table.num_columns() > 0 && num_rows > 0) {
-    column_stats = gather_statistic_blobs(orc_table, segmentation);
-  }
+  if (num_rows > 0) {
+    // Gather column statistics
+    auto const column_stats = enable_statistics_ && table.num_columns() > 0
+                                ? gather_statistic_blobs(orc_table, segmentation)
+                                : std::vector<ColStatsBlob>{};
 
-  // Allocate intermediate output stream buffer
-  size_t compressed_bfr_size       = 0;
-  size_t num_compressed_blocks     = 0;
-  size_t max_compressed_block_size = 0;
-  if (compression_kind_ != NONE) {
-    nvcompBatchedSnappyCompressGetMaxOutputChunkSize(
-      compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size);
-  }
-  auto stream_output = [&]() {
-    size_t max_stream_size = 0;
-    bool all_device_write  = true;
+    // Allocate intermediate output stream buffer
+    size_t compressed_bfr_size       = 0;
+    size_t num_compressed_blocks     = 0;
+    size_t max_compressed_block_size = 0;
+    if (compression_kind_ != NONE) {
+      nvcompBatchedSnappyCompressGetMaxOutputChunkSize(
+        compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size);
+    }
+    auto stream_output = [&]() {
+      size_t max_stream_size = 0;
+      bool all_device_write  = true;
+
+      for (auto& ss : strm_descs.host_view().flat_view()) {
+        if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; }
+        size_t stream_size = ss.stream_size;
+        if (compression_kind_ != NONE) {
+          ss.first_block = num_compressed_blocks;
+          ss.bfr_offset  = compressed_bfr_size;
+
+          auto num_blocks = std::max<uint32_t>(
+            (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1);
+          stream_size += num_blocks * BLOCK_HEADER_SIZE;
+          num_compressed_blocks += num_blocks;
+          compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks;
+        }
+        max_stream_size = std::max(max_stream_size, stream_size);
+      }
 
-    for (auto& ss : strm_descs.host_view().flat_view()) {
-      if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; }
-      size_t stream_size = ss.stream_size;
-      if (compression_kind_ != NONE) {
-        ss.first_block = num_compressed_blocks;
-        ss.bfr_offset  = compressed_bfr_size;
-
-        auto num_blocks = std::max<uint32_t>(
-          (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1);
-        stream_size += num_blocks * BLOCK_HEADER_SIZE;
-        num_compressed_blocks += num_blocks;
-        compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks;
+      if (all_device_write) {
+        return pinned_buffer<uint8_t>{nullptr, cudaFreeHost};
+      } else {
+        return pinned_buffer<uint8_t>{[](size_t size) {
+                                        uint8_t* ptr = nullptr;
+                                        CUDA_TRY(cudaMallocHost(&ptr, size));
+                                        return ptr;
+                                      }(max_stream_size),
+                                      cudaFreeHost};
       }
-      max_stream_size = std::max(max_stream_size, stream_size);
-    }
+    }();
 
-    if (all_device_write) {
-      return pinned_buffer<uint8_t>{nullptr, cudaFreeHost};
-    } else {
-      return pinned_buffer<uint8_t>{[](size_t size) {
-                                      uint8_t* ptr = nullptr;
-                                      CUDA_TRY(cudaMallocHost(&ptr, size));
-                                      return ptr;
-                                    }(max_stream_size),
-                                    cudaFreeHost};
+    // Compress the data streams
+    rmm::device_buffer compressed_data(compressed_bfr_size, stream);
+    hostdevice_vector<gpu_inflate_status_s> comp_out(num_compressed_blocks, stream);
+    hostdevice_vector<gpu_inflate_input_s> comp_in(num_compressed_blocks, stream);
+    if (compression_kind_ != NONE) {
+      strm_descs.host_to_device(stream);
+      gpu::CompressOrcDataStreams(static_cast<uint8_t*>(compressed_data.data()),
+                                  num_compressed_blocks,
+                                  compression_kind_,
+                                  compression_blocksize_,
+                                  max_compressed_block_size,
+                                  strm_descs,
+                                  enc_data.streams,
+                                  comp_in,
+                                  comp_out,
+                                  stream);
+      strm_descs.device_to_host(stream);
+      comp_out.device_to_host(stream, true);
     }
-  }();
-
-  // Compress the data streams
-  rmm::device_buffer compressed_data(compressed_bfr_size, stream);
-  hostdevice_vector<gpu_inflate_status_s> comp_out(num_compressed_blocks, stream);
-  hostdevice_vector<gpu_inflate_input_s> comp_in(num_compressed_blocks, stream);
-  if (compression_kind_ != NONE) {
-    strm_descs.host_to_device(stream);
-    gpu::CompressOrcDataStreams(static_cast<uint8_t*>(compressed_data.data()),
-                                num_compressed_blocks,
-                                compression_kind_,
-                                compression_blocksize_,
-                                max_compressed_block_size,
-                                strm_descs,
-                                enc_data.streams,
-                                comp_in,
-                                comp_out,
-                                stream);
-    strm_descs.device_to_host(stream);
-    comp_out.device_to_host(stream, true);
-  }
 
-  ProtobufWriter pbw_(&buffer_);
-
-  // Write stripes
-  std::vector<std::future<void>> write_tasks;
-  for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) {
-    auto const& rowgroups_range = segmentation.stripes[stripe_id];
-    auto& stripe                = stripes[stripe_id];
-
-    stripe.offset = out_sink_->bytes_written();
-
-    // Column (skippable) index streams appear at the start of the stripe
-    for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) {
-      write_index_stream(stripe_id,
-                         stream_id,
-                         orc_table.columns,
-                         rowgroups_range,
-                         enc_data.streams,
-                         strm_descs,
-                         comp_out,
-                         &stripe,
-                         &streams,
-                         &pbw_);
-    }
+    ProtobufWriter pbw_(&buffer_);
+
+    // Write stripes
+    std::vector<std::future<void>> write_tasks;
+    for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) {
+      auto const& rowgroups_range = segmentation.stripes[stripe_id];
+      auto& stripe                = stripes[stripe_id];
+
+      stripe.offset = out_sink_->bytes_written();
+
+      // Column (skippable) index streams appear at the start of the stripe
+      for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) {
+        write_index_stream(stripe_id,
+                           stream_id,
+                           orc_table.columns,
+                           rowgroups_range,
+                           enc_data.streams,
+                           strm_descs,
+                           comp_out,
+                           &stripe,
+                           &streams,
+                           &pbw_);
+      }
 
-    // Column data consisting one or more separate streams
-    for (auto const& strm_desc : strm_descs[stripe_id]) {
-      write_tasks.push_back(
-        write_data_stream(strm_desc,
-                          enc_data.streams[strm_desc.column_id][rowgroups_range.first],
-                          static_cast<uint8_t const*>(compressed_data.data()),
-                          stream_output.get(),
-                          &stripe,
-                          &streams));
-    }
+      // Column data consisting one or more separate streams
+      for (auto const& strm_desc : strm_descs[stripe_id]) {
+        write_tasks.push_back(
+          write_data_stream(strm_desc,
+                            enc_data.streams[strm_desc.column_id][rowgroups_range.first],
+                            static_cast<uint8_t const*>(compressed_data.data()),
+                            stream_output.get(),
+                            &stripe,
+                            &streams));
+      }
 
-    // Write stripefooter consisting of stream information
-    StripeFooter sf;
-    sf.streams = streams;
-    sf.columns.resize(orc_table.num_columns() + 1);
-    sf.columns[0].kind = DIRECT;
-    for (size_t i = 1; i < sf.columns.size(); ++i) {
-      sf.columns[i].kind = orc_table.column(i - 1).orc_encoding();
-      sf.columns[i].dictionarySize =
-        (sf.columns[i].kind == DICTIONARY_V2)
-          ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings
-          : 0;
-      if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; }
+      // Write stripefooter consisting of stream information
+      StripeFooter sf;
+      sf.streams = streams;
+      sf.columns.resize(orc_table.num_columns() + 1);
+      sf.columns[0].kind = DIRECT;
+      for (size_t i = 1; i < sf.columns.size(); ++i) {
+        sf.columns[i].kind = orc_table.column(i - 1).orc_encoding();
+        sf.columns[i].dictionarySize =
+          (sf.columns[i].kind == DICTIONARY_V2)
+            ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings
+            : 0;
+        if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; }
+      }
+      buffer_.resize((compression_kind_ != NONE) ? 3 : 0);
+      pbw_.write(sf);
+      stripe.footerLength = buffer_.size();
+      if (compression_kind_ != NONE) {
+        uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1;
+        buffer_[0]             = static_cast<uint8_t>(uncomp_sf_len >> 0);
+        buffer_[1]             = static_cast<uint8_t>(uncomp_sf_len >> 8);
+        buffer_[2]             = static_cast<uint8_t>(uncomp_sf_len >> 16);
+      }
+      out_sink_->host_write(buffer_.data(), buffer_.size());
     }
-    buffer_.resize((compression_kind_ != NONE) ? 3 : 0);
-    pbw_.write(sf);
-    stripe.footerLength = buffer_.size();
-    if (compression_kind_ != NONE) {
-      uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1;
-      buffer_[0]             = static_cast<uint8_t>(uncomp_sf_len >> 0);
-      buffer_[1]             = static_cast<uint8_t>(uncomp_sf_len >> 8);
-      buffer_[2]             = static_cast<uint8_t>(uncomp_sf_len >> 16);
+    for (auto const& task : write_tasks) {
+      task.wait();
     }
-    out_sink_->host_write(buffer_.data(), buffer_.size());
-  }
-  for (auto const& task : write_tasks) {
-    task.wait();
-  }
 
-  if (column_stats.size() != 0) {
-    // File-level statistics
-    // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls
-    if (single_write_mode) {
-      // First entry contains total number of rows
-      buffer_.resize(0);
-      pbw_.putb(1 * 8 + PB_TYPE_VARINT);
-      pbw_.put_uint(num_rows);
-      ff.statistics.reserve(1 + orc_table.num_columns());
-      ff.statistics.emplace_back(std::move(buffer_));
-      // Add file stats, stored after stripe stats in `column_stats`
-      ff.statistics.insert(
-        ff.statistics.end(),
-        std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(),
-        std::make_move_iterator(column_stats.end()));
-    }
-    // Stripe-level statistics
-    size_t first_stripe = md.stripeStats.size();
-    md.stripeStats.resize(first_stripe + stripes.size());
-    for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) {
-      md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns());
-      buffer_.resize(0);
-      pbw_.putb(1 * 8 + PB_TYPE_VARINT);
-      pbw_.put_uint(stripes[stripe_id].numberOfRows);
-      md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_);
-      for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) {
-        size_t idx = stripes.size() * col_idx + stripe_id;
-        if (idx < column_stats.size()) {
-          md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] =
-            std::move(column_stats[idx]);
+    if (not column_stats.empty()) {
+      // File-level statistics
+      // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls
+      if (single_write_mode) {
+        // First entry contains total number of rows
+        buffer_.resize(0);
+        pbw_.putb(1 * 8 + PB_TYPE_VARINT);
+        pbw_.put_uint(num_rows);
+        ff.statistics.reserve(1 + orc_table.num_columns());
+        ff.statistics.emplace_back(std::move(buffer_));
+        // Add file stats, stored after stripe stats in `column_stats`
+        ff.statistics.insert(
+          ff.statistics.end(),
+          std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(),
+          std::make_move_iterator(column_stats.end()));
+      }
+      // Stripe-level statistics
+      size_t first_stripe = md.stripeStats.size();
+      md.stripeStats.resize(first_stripe + stripes.size());
+      for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) {
+        md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns());
+        buffer_.resize(0);
+        pbw_.putb(1 * 8 + PB_TYPE_VARINT);
+        pbw_.put_uint(stripes[stripe_id].numberOfRows);
+        md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_);
+        for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) {
+          size_t idx = stripes.size() * col_idx + stripe_id;
+          if (idx < column_stats.size()) {
+            md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] =
+              std::move(column_stats[idx]);
+          }
         }
       }
     }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 283715478a0..a7f9aec7bb4 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -179,6 +179,7 @@ class hostdevice_2dvector {
 
   auto size() const noexcept { return _size; }
   auto count() const noexcept { return _size.first * _size.second; }
+  auto is_empty() const noexcept { return count() == 0; }
 
   T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); }
   T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); }
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 6b02874146e..dc176992434 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1526,3 +1526,18 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir):
     # Segfaults when RLE stream sizes don't account for varint length
     pa_out = pa.orc.ORCFile(reencoded).read()
     assert_eq(df.to_pandas(), pa_out)
+
+
+def test_empty_columns():
+    buffer = BytesIO()
+    # string and decimal columns have additional steps that need to be skipped
+    expected = cudf.DataFrame(
+        {
+            "string": cudf.Series([], dtype="str"),
+            "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)),
+        }
+    )
+    expected.to_orc(buffer, compression="snappy")
+
+    got_df = cudf.read_orc(buffer)
+    assert_eq(expected, got_df)

From 0c08543955a01470baa4fbdbab927298dcf6afd9 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <devavret@users.noreply.github.com>
Date: Fri, 3 Dec 2021 04:53:37 +0530
Subject: [PATCH 39/42] Update cmake and conda to 22.02 (#9746)

Changes related to the update to 22.02 in one conda environment recipe (only 11.5) were missed. This adds them.
Also makes the project version changes in CMake related to the update from 21.12 to 22.02.

Authors:
  - Devavret Makkar (https://github.com/devavret)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Ray Douglass (https://github.com/raydouglass)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9746
---
 ci/release/update-version.sh     | 6 +++---
 cpp/CMakeLists.txt               | 2 +-
 cpp/libcudf_kafka/CMakeLists.txt | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index eeb76a15fcc..86432a92128 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -30,13 +30,13 @@ function sed_runner() {
 }
 
 # cpp update
-sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt
 
 # cpp libcudf_kafka update
-sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt
 
 # cpp cudf_jni update
-sed_runner 's/'"CUDF_JNI VERSION .* LANGUAGES"'/'"CUDF_JNI VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' java/src/main/native/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt
 
 # rapids-cmake version
 sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 50bdc30b292..e2b317f2e03 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF)
 
 project(
   CUDF
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES C CXX CUDA
 )
 
diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt
index 435ff3b5987..d0874b57c2d 100644
--- a/cpp/libcudf_kafka/CMakeLists.txt
+++ b/cpp/libcudf_kafka/CMakeLists.txt
@@ -22,7 +22,7 @@ include(rapids-find)
 
 project(
   CUDA_KAFKA
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES CXX
 )
 

From ce64e53264d21c6e59fe98548796a7b6bae24c07 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 2 Dec 2021 20:19:12 -0600
Subject: [PATCH 40/42] Add directory-partitioned data support to
 cudf.read_parquet (#9720)

Closes #9684
Closes #9690

This PR refactors path handling in `cudf.read_parquet` and uses `pyarrow.dataset` to support directory-partitioned datasets (with full filtering support at row-group granularity). Since it is my understanding that some users may wish for directory-partitioned columns to be represented as a raw dtype (rather than always becoming categorical), I also added an optional `categorical_partitions` argument (open to suggestions on a better name).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: https://github.com/rapidsai/cudf/pull/9720
---
 python/cudf/cudf/io/json.py              |   2 +-
 python/cudf/cudf/io/orc.py               |   2 +-
 python/cudf/cudf/io/parquet.py           | 286 +++++++++++++++++++----
 python/cudf/cudf/tests/test_parquet.py   |  94 +++++++-
 python/cudf/cudf/tests/test_s3.py        |   9 +-
 python/cudf/cudf/utils/ioutils.py        |  26 ++-
 python/dask_cudf/dask_cudf/io/parquet.py |   7 +-
 7 files changed, 355 insertions(+), 71 deletions(-)

diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index a48cfd07d3f..1f876214b16 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -37,7 +37,7 @@ def read_json(
         for source in path_or_buf:
             if ioutils.is_directory(source, **kwargs):
                 fs = ioutils._ensure_filesystem(
-                    passed_filesystem=None, path=source
+                    passed_filesystem=None, path=source, **kwargs
                 )
                 source = ioutils.stringify_pathlike(source)
                 source = fs.sep.join([source, "*.json"])
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index ecb1b0cd185..c1cce3f996f 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -316,7 +316,7 @@ def read_orc(
     for source in filepath_or_buffer:
         if ioutils.is_directory(source, **kwargs):
             fs = ioutils._ensure_filesystem(
-                passed_filesystem=None, path=source
+                passed_filesystem=None, path=source, **kwargs,
             )
             source = stringify_path(source)
             source = fs.sep.join([source, "*.orc"])
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 9d665d9a0a5..04d64969a16 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -12,6 +12,7 @@
 import cudf
 from cudf._lib import parquet as libparquet
 from cudf.api.types import is_list_like
+from cudf.core.column import as_column, build_categorical_column
 from cudf.utils import ioutils
 
 
@@ -80,7 +81,7 @@ def write_to_dataset(
         kwargs for to_parquet function.
     """
 
-    fs = ioutils._ensure_filesystem(fs, root_path)
+    fs = ioutils._ensure_filesystem(fs, root_path, **kwargs)
     fs.mkdirs(root_path, exist_ok=True)
     metadata = []
 
@@ -163,11 +164,19 @@ def read_parquet_metadata(path):
     return num_rows, num_row_groups, col_names
 
 
-def _process_row_groups(paths, fs, filters=None, row_groups=None):
+def _process_dataset(
+    paths, fs, filters=None, row_groups=None, categorical_partitions=True,
+):
+    # Returns:
+    #     file_list - Expanded/filtered list of paths
+    #     row_groups - Filtered list of row-group selections
+    #     partition_keys - list of partition keys for each file
+    #     partition_categories - Categories for each partition
 
     # The general purpose of this function is to (1) expand
     # directory input into a list of paths (using the pyarrow
-    # dataset API), and (2) to apply row-group filters.
+    # dataset API), (2) to apply row-group filters, and (3)
+    # to discover directory-partitioning information
 
     # Deal with case that the user passed in a directory name
     file_list = paths
@@ -186,28 +195,107 @@ def _process_row_groups(paths, fs, filters=None, row_groups=None):
     if len(file_list) == 0:
         raise FileNotFoundError(f"{paths} could not be resolved to any files")
 
-    if filters is not None:
-        # Load IDs of filtered row groups for each file in dataset
-        filtered_rg_ids = defaultdict(list)
-        for fragment in dataset.get_fragments(filter=filters):
-            for rg_fragment in fragment.split_by_row_group(filters):
-                for rg_info in rg_fragment.row_groups:
-                    filtered_rg_ids[rg_fragment.path].append(rg_info.id)
-
-        # Initialize row_groups to be selected
-        if row_groups is None:
-            row_groups = [None for _ in dataset.files]
-
-        # Store IDs of selected row groups for each file
-        for i, file in enumerate(dataset.files):
-            if row_groups[i] is None:
-                row_groups[i] = filtered_rg_ids[file]
-            else:
-                row_groups[i] = filter(
-                    lambda id: id in row_groups[i], filtered_rg_ids[file]
+    # Deal with directory partitioning
+    # Get all partition keys (without filters)
+    partition_categories = defaultdict(list)
+    file_fragment = None
+    for file_fragment in dataset.get_fragments():
+        keys = ds._get_partition_keys(file_fragment.partition_expression)
+        if not (keys or partition_categories):
+            # Bail - This is not a directory-partitioned dataset
+            break
+        for k, v in keys.items():
+            if v not in partition_categories[k]:
+                partition_categories[k].append(v)
+        if not categorical_partitions:
+            # Bail - We don't need to discover all categories.
+            # We only need to save the partition keys from this
+            # first `file_fragment`
+            break
+
+    if partition_categories and file_fragment is not None:
+        # Check/correct order of `categories` using last file_frag,
+        # because `_get_partition_keys` does NOT preserve the
+        # partition-hierarchy order of the keys.
+        cat_keys = [
+            part.split("=")[0]
+            for part in file_fragment.path.split(fs.sep)
+            if "=" in part
+        ]
+        if set(partition_categories) == set(cat_keys):
+            partition_categories = {
+                k: partition_categories[k]
+                for k in cat_keys
+                if k in partition_categories
+            }
+
+    # If we do not have partitioned data and
+    # are not filtering, we can return here
+    if filters is None and not partition_categories:
+        return file_list, row_groups, [], {}
+
+    # Record initial row_groups input
+    row_groups_map = {}
+    if row_groups is not None:
+        # Make sure paths and row_groups map 1:1
+        # and save the initial mapping
+        if len(paths) != len(file_list):
+            raise ValueError(
+                "Cannot specify a row_group selection for a directory path."
+            )
+        row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)}
+
+    # Apply filters and discover partition columns
+    partition_keys = []
+    if partition_categories or filters is not None:
+        file_list = []
+        if filters is not None:
+            row_groups = []
+        for file_fragment in dataset.get_fragments(filter=filters):
+            path = file_fragment.path
+
+            # Extract hive-partition keys, and make sure they
+            # are ordered the same as they are in `partition_categories`
+            if partition_categories:
+                raw_keys = ds._get_partition_keys(
+                    file_fragment.partition_expression
+                )
+                partition_keys.append(
+                    [
+                        (name, raw_keys[name])
+                        for name in partition_categories.keys()
+                    ]
                 )
 
-    return file_list, row_groups
+            # Apply row-group filtering
+            selection = row_groups_map.get(path, None)
+            if selection is not None or filters is not None:
+                filtered_row_groups = [
+                    rg_info.id
+                    for rg_fragment in file_fragment.split_by_row_group(
+                        filters, schema=dataset.schema,
+                    )
+                    for rg_info in rg_fragment.row_groups
+                ]
+            file_list.append(path)
+            if filters is not None:
+                if selection is None:
+                    row_groups.append(filtered_row_groups)
+                else:
+                    row_groups.append(
+                        [
+                            rg_id
+                            for rg_id in filtered_row_groups
+                            if rg_id in selection
+                        ]
+                    )
+
+    return (
+        file_list,
+        row_groups,
+        partition_keys,
+        partition_categories if categorical_partitions else {},
+    )
 
 
 def _get_byte_ranges(file_list, row_groups, columns, fs, **kwargs):
@@ -319,6 +407,7 @@ def read_parquet(
     strings_to_categorical=False,
     use_pandas_metadata=True,
     use_python_file_object=False,
+    categorical_partitions=True,
     *args,
     **kwargs,
 ):
@@ -345,17 +434,29 @@ def read_parquet(
     # Start by trying construct a filesystem object, so we
     # can apply filters on remote file-systems
     fs, paths = ioutils._get_filesystem_and_paths(filepath_or_buffer, **kwargs)
-    filepath_or_buffer = paths if paths else filepath_or_buffer
-    if fs is None and filters is not None:
-        raise ValueError("cudf cannot apply filters to open file objects.")
 
-    # Apply filters now (before converting non-local paths to buffers).
-    # Note that `_process_row_groups` will also expand `filepath_or_buffer`
-    # into a full list of files if it is a directory.
-    if fs is not None:
-        filepath_or_buffer, row_groups = _process_row_groups(
-            filepath_or_buffer, fs, filters=filters, row_groups=row_groups,
+    # Use pyarrow dataset to detect/process directory-partitioned
+    # data and apply filters. Note that we can only support partitioned
+    # data and filtering if the input is a single directory or list of
+    # paths.
+    partition_keys = []
+    partition_categories = {}
+    if fs and paths:
+        (
+            paths,
+            row_groups,
+            partition_keys,
+            partition_categories,
+        ) = _process_dataset(
+            paths,
+            fs,
+            filters=filters,
+            row_groups=row_groups,
+            categorical_partitions=categorical_partitions,
         )
+    elif filters is not None:
+        raise ValueError("cudf cannot apply filters to open file objects.")
+    filepath_or_buffer = paths if paths else filepath_or_buffer
 
     # Check if we should calculate the specific byte-ranges
     # needed for each parquet file. We always do this when we
@@ -380,15 +481,6 @@ def read_parquet(
     filepaths_or_buffers = []
     for i, source in enumerate(filepath_or_buffer):
 
-        if ioutils.is_directory(source, **kwargs):
-            # Note: For now, we know `fs` is an fsspec filesystem
-            # object, but it may be an arrow object in the future
-            fsspec_fs = ioutils._ensure_filesystem(
-                passed_filesystem=fs, path=source
-            )
-            source = ioutils.stringify_pathlike(source)
-            source = fsspec_fs.sep.join([source, "*.parquet"])
-
         tmp_source, compression = ioutils.get_filepath_or_buffer(
             path_or_data=source,
             compression=None,
@@ -410,6 +502,117 @@ def read_parquet(
         else:
             filepaths_or_buffers.append(tmp_source)
 
+    # Warn user if they are not using cudf for IO
+    # (There is a good chance this was not the intention)
+    if engine != "cudf":
+        warnings.warn(
+            "Using CPU via PyArrow to read Parquet dataset."
+            "This option is both inefficient and unstable!"
+        )
+        if filters is not None:
+            warnings.warn(
+                "Parquet row-group filtering is only supported with "
+                "'engine=cudf'. Use pandas or pyarrow API directly "
+                "for full CPU-based filtering functionality."
+            )
+
+    return _parquet_to_frame(
+        filepaths_or_buffers,
+        engine,
+        *args,
+        columns=columns,
+        row_groups=row_groups,
+        skiprows=skiprows,
+        num_rows=num_rows,
+        strings_to_categorical=strings_to_categorical,
+        use_pandas_metadata=use_pandas_metadata,
+        partition_keys=partition_keys,
+        partition_categories=partition_categories,
+        **kwargs,
+    )
+
+
+def _parquet_to_frame(
+    paths_or_buffers,
+    *args,
+    row_groups=None,
+    partition_keys=None,
+    partition_categories=None,
+    **kwargs,
+):
+
+    # If this is not a partitioned read, only need
+    # one call to `_read_parquet`
+    if not partition_keys:
+        return _read_parquet(
+            paths_or_buffers, *args, row_groups=row_groups, **kwargs,
+        )
+
+    # For partitioned data, we need a distinct read for each
+    # unique set of partition keys. Therefore, we start by
+    # aggregating all paths with matching keys using a dict
+    plan = {}
+    for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)):
+        rgs = row_groups[i] if row_groups else None
+        tkeys = tuple(keys)
+        if tkeys in plan:
+            plan[tkeys][0].append(path)
+            if rgs is not None:
+                plan[tkeys][1].append(rgs)
+        else:
+            plan[tkeys] = ([path], None if rgs is None else [rgs])
+
+    dfs = []
+    for part_key, (key_paths, key_row_groups) in plan.items():
+        # Add new DataFrame to our list
+        dfs.append(
+            _read_parquet(
+                key_paths, *args, row_groups=key_row_groups, **kwargs,
+            )
+        )
+        # Add partition columns to the last DataFrame
+        for (name, value) in part_key:
+            if partition_categories and name in partition_categories:
+                # Build the categorical column from `codes`
+                codes = as_column(
+                    partition_categories[name].index(value),
+                    length=len(dfs[-1]),
+                )
+                dfs[-1][name] = build_categorical_column(
+                    categories=partition_categories[name],
+                    codes=codes,
+                    size=codes.size,
+                    offset=codes.offset,
+                    ordered=False,
+                )
+            else:
+                # Not building categorical columns, so
+                # `value` is already what we want
+                dfs[-1][name] = as_column(value, length=len(dfs[-1]))
+
+    # Concatenate dfs and return.
+    # Assume we can ignore the index if it has no name.
+    return (
+        cudf.concat(dfs, ignore_index=dfs[-1].index.name is None)
+        if len(dfs) > 1
+        else dfs[0]
+    )
+
+
+def _read_parquet(
+    filepaths_or_buffers,
+    engine,
+    columns=None,
+    row_groups=None,
+    skiprows=None,
+    num_rows=None,
+    strings_to_categorical=None,
+    use_pandas_metadata=None,
+    *args,
+    **kwargs,
+):
+    # Simple helper function to dispatch between
+    # cudf and pyarrow to read parquet data
     if engine == "cudf":
         return libparquet.read_parquet(
             filepaths_or_buffers,
@@ -421,7 +624,6 @@ def read_parquet(
             use_pandas_metadata=use_pandas_metadata,
         )
     else:
-        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
         return cudf.DataFrame.from_arrow(
             pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                 columns=columns, *args, **kwargs
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index b6595be9566..516ee0d17d3 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1578,7 +1578,7 @@ def test_parquet_writer_bytes_io(simple_gdf):
 
 @pytest.mark.parametrize("filename", ["myfile.parquet", None])
 @pytest.mark.parametrize("cols", [["b"], ["c", "b"]])
-def test_parquet_write_partitioned(tmpdir_factory, cols, filename):
+def test_parquet_partitioned(tmpdir_factory, cols, filename):
     # Checks that write_to_dataset is wrapping to_parquet
     # as expected
     gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
@@ -1597,10 +1597,14 @@ def test_parquet_write_partitioned(tmpdir_factory, cols, filename):
         gdf_dir, index=False, partition_cols=cols, partition_file_name=filename
     )
 
-    # Use pandas since dataset may be partitioned
-    expect = pd.read_parquet(pdf_dir)
-    got = pd.read_parquet(gdf_dir)
-    assert_eq(expect, got)
+    # Read back with pandas to compare
+    expect_pd = pd.read_parquet(pdf_dir)
+    got_pd = pd.read_parquet(gdf_dir)
+    assert_eq(expect_pd, got_pd)
+
+    # Check that cudf and pd return the same read
+    got_cudf = cudf.read_parquet(gdf_dir)
+    assert_eq(got_pd, got_cudf)
 
     # If filename is specified, check that it is correct
     if filename:
@@ -1629,9 +1633,9 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols):
     gdf.to_parquet(dir1, partition_cols=cols)
     cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols)
 
-    # cudf read_parquet cannot handle partitioned dataset
-    expect = pd.read_parquet(dir1)
-    got = pd.read_parquet(dir2)
+    # Read back with cudf
+    expect = cudf.read_parquet(dir1)
+    got = cudf.read_parquet(dir2)
     assert_eq(expect, got)
 
     gdf = cudf.DataFrame(
@@ -1645,6 +1649,80 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols):
         gdf.to_parquet(dir1, partition_cols=cols)
 
 
+@pytest.mark.parametrize(
+    "pfilters", [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]],
+)
+@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"])
+@pytest.mark.parametrize("use_cat", [True, False])
+def test_read_parquet_partitioned_filtered(
+    tmpdir, pfilters, selection, use_cat
+):
+    path = str(tmpdir)
+    size = 100
+    df = cudf.DataFrame(
+        {
+            "a": np.arange(0, stop=size, dtype="int64"),
+            "b": np.random.choice(list("abcd"), size=size),
+            "c": np.random.choice(np.arange(4), size=size),
+        }
+    )
+    df.to_parquet(path, partition_cols=["c", "b"])
+
+    if selection == "files":
+        # Pass in a list of paths
+        fs = get_fs_token_paths(path)[0]
+        read_path = fs.find(path)
+        row_groups = None
+    elif selection == "row-groups":
+        # Pass in a list of paths AND row-group ids
+        fs = get_fs_token_paths(path)[0]
+        read_path = fs.find(path)
+        row_groups = [[0] for p in read_path]
+    else:
+        # Pass in a directory path
+        # (row-group selection not allowed in this case)
+        read_path = path
+        row_groups = None
+
+    # Filter on partitioned columns
+    expect = pd.read_parquet(read_path, filters=pfilters)
+    got = cudf.read_parquet(
+        read_path,
+        filters=pfilters,
+        row_groups=row_groups,
+        categorical_partitions=use_cat,
+    )
+    if use_cat:
+        assert got.dtypes["b"] == "category"
+        assert got.dtypes["c"] == "category"
+    else:
+        # Check that we didn't get categorical
+        # columns, but convert back to categorical
+        # for comparison with pandas
+        assert got.dtypes["b"] == "object"
+        assert got.dtypes["c"] == "int"
+        got["b"] = pd.Categorical(
+            got["b"].to_pandas(), categories=list("abcd")
+        )
+        got["c"] = pd.Categorical(
+            got["c"].to_pandas(), categories=np.arange(4)
+        )
+    assert_eq(expect, got)
+
+    # Filter on non-partitioned column.
+    # Cannot compare to pandas, since the pyarrow
+    # backend will filter by row (and cudf can
+    # only filter by row-group, for now)
+    filters = [("a", "==", 10)]
+    got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,)
+    assert len(got) < len(df) and 10 in got["a"]
+
+    # Filter on both kinds of columns
+    filters = [[("a", "==", 10)], [("c", "==", 1)]]
+    got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,)
+    assert len(got) < len(df) and (1 in got["c"] and 10 in got["a"])
+
+
 def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf):
     gdf_fname = tmpdir.join("gdf.parquet")
     test_path = "test/path"
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index dea876891f8..5738e1f0d00 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -346,12 +346,17 @@ def test_read_parquet_filters(s3_base, s3so, pdf, python_file):
     assert_eq(pdf.iloc[:0], got.reset_index(drop=True))
 
 
-def test_write_parquet(s3_base, s3so, pdf):
+@pytest.mark.parametrize("partition_cols", [None, ["String"]])
+def test_write_parquet(s3_base, s3so, pdf, partition_cols):
     fname = "test_parquet_writer.parquet"
     bname = "parquet"
     gdf = cudf.from_pandas(pdf)
     with s3_context(s3_base=s3_base, bucket=bname) as s3fs:
-        gdf.to_parquet("s3://{}/{}".format(bname, fname), storage_options=s3so)
+        gdf.to_parquet(
+            "s3://{}/{}".format(bname, fname),
+            partition_cols=partition_cols,
+            storage_options=s3so,
+        )
         assert s3fs.exists("s3://{}/{}".format(bname, fname))
 
         got = pd.read_parquet(s3fs.open("s3://{}/{}".format(bname, fname)))
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 0f9d9d53b23..e6c031acac7 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -154,6 +154,9 @@
 strings_to_categorical : boolean, default False
     If True, return string columns as GDF_CATEGORY dtype; if False, return a
     as GDF_STRING dtype.
+categorical_partitions : boolean, default True
+    Whether directory-partitioned columns should be interpreted as categorical
+    or raw dtypes.
 use_pandas_metadata : boolean, default True
     If True and dataset has custom PANDAS schema metadata, ensure that index
     columns are also loaded.
@@ -1129,7 +1132,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs):
         storage_options = kwargs.get("storage_options")
         path_or_data = os.path.expanduser(path_or_data)
         try:
-            fs, _, paths = fsspec.get_fs_token_paths(
+            fs, _, paths = get_fs_token_paths(
                 path_or_data, mode="rb", storage_options=storage_options
             )
         except ValueError as e:
@@ -1153,9 +1156,9 @@ def is_directory(path_or_data, **kwargs):
         storage_options = kwargs.get("storage_options")
         path_or_data = os.path.expanduser(path_or_data)
         try:
-            fs, _, paths = fsspec.get_fs_token_paths(
+            fs = get_fs_token_paths(
                 path_or_data, mode="rb", storage_options=storage_options
-            )
+            )[0]
         except ValueError as e:
             if str(e).startswith("Protocol not known"):
                 return False
@@ -1189,10 +1192,8 @@ def _get_filesystem_and_paths(path_or_data, **kwargs):
         else:
             path_or_data = [path_or_data]
 
-        # Pyarrow did not support the protocol or storage options.
-        # Fall back to fsspec
         try:
-            fs, _, fs_paths = fsspec.get_fs_token_paths(
+            fs, _, fs_paths = get_fs_token_paths(
                 path_or_data, mode="rb", storage_options=storage_options
             )
             return_paths = fs_paths
@@ -1322,9 +1323,9 @@ def get_writer_filepath_or_buffer(path_or_data, mode, **kwargs):
     if isinstance(path_or_data, str):
         storage_options = kwargs.get("storage_options", {})
         path_or_data = os.path.expanduser(path_or_data)
-        fs, _, _ = fsspec.get_fs_token_paths(
+        fs = get_fs_token_paths(
             path_or_data, mode=mode or "w", storage_options=storage_options
-        )
+        )[0]
 
         if not _is_local_filesystem(fs):
             filepath_or_buffer = fsspec.open(
@@ -1513,11 +1514,12 @@ def _prepare_filters(filters):
     return filters
 
 
-def _ensure_filesystem(passed_filesystem, path):
+def _ensure_filesystem(passed_filesystem, path, **kwargs):
     if passed_filesystem is None:
-        return get_fs_token_paths(path[0] if isinstance(path, list) else path)[
-            0
-        ]
+        return get_fs_token_paths(
+            path[0] if isinstance(path, list) else path,
+            storage_options=kwargs.get("storage_options", {}),
+        )[0]
     return passed_filesystem
 
 
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index b47a5e78095..a49d73493ec 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -126,11 +126,8 @@ def _read_paths(
 
                 # Build the column from `codes` directly
                 # (since the category is often a larger dtype)
-                codes = (
-                    as_column(partitions[i].keys.index(index2))
-                    .as_frame()
-                    .repeat(len(df))
-                    ._data[None]
+                codes = as_column(
+                    partitions[i].keys.index(index2), length=len(df),
                 )
                 df[name] = build_categorical_column(
                     categories=partitions[i].keys,

From e82cc62e2ea61211c64ba4784cb131d5b535644c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 3 Dec 2021 04:46:25 -0800
Subject: [PATCH 41/42] Fix join of MultiIndex to Index with one column and
 overlapping name. (#9830)

This PR resolves #9823

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/9830
---
 python/cudf/cudf/core/_base_index.py   |  4 ++--
 python/cudf/cudf/tests/test_joining.py | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index d688b75ed14..2fcc976d8e1 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1147,14 +1147,14 @@ def join(
         if isinstance(lhs, cudf.MultiIndex):
             if level is not None and isinstance(level, int):
                 on = lhs._data.select_by_index(level).names[0]
-            right_names = (on,) or right_names
+            right_names = (on,) if on is not None else right_names
             on = right_names[0]
             if how == "outer":
                 how = "left"
             elif how == "right":
                 how = "inner"
         else:
-            # Both are nomal indices
+            # Both are normal indices
             right_names = left_names
             on = right_names[0]
 
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 0518cc2c9b9..d25c6130bfb 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -2150,3 +2150,16 @@ def test_join_redundant_params():
         lhs.merge(rhs, right_on="a", left_index=True, right_index=True)
     with pytest.raises(ValueError):
         lhs.merge(rhs, left_on="c", right_on="b")
+
+
+def test_join_multiindex_index():
+    # test joining a MultiIndex with an Index with overlapping name
+    lhs = (
+        cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]})
+        .set_index(["a", "b"])
+        .index
+    )
+    rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index
+    expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner")
+    got = lhs.join(rhs, how="inner")
+    assert_join_results_equal(expect, got, how="inner")

From 69e6dbbf447a951e4b08f15c737eedcbaf3291da Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 3 Dec 2021 10:18:04 -0500
Subject: [PATCH 42/42] Move the binary_ops common dispatcher logic to be
 executed on the CPU (#9816)

* move NullEquals to separate file

* To improve runtime performance move more binary_ops dispatch to host

* make sure to forceinline the operator_dispatcher

* Correct style issues found by ci

* Expand the binary-op compiled benchmark suite

* Ensure forceinline is on binary ops device dispatch functions

* Correct style issues found by ci

Co-authored-by: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Co-authored-by: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
---
 cpp/CMakeLists.txt                            |  1 +
 .../binaryop/compiled_binaryop_benchmark.cpp  | 66 ++++++++++---------
 .../cudf/utilities/type_dispatcher.hpp        | 14 ++--
 cpp/src/binaryop/compiled/NullEquals.cu       | 26 ++++++++
 cpp/src/binaryop/compiled/binary_ops.cu       |  2 +-
 cpp/src/binaryop/compiled/binary_ops.cuh      | 63 ++++++++++++------
 cpp/src/binaryop/compiled/equality_ops.cu     | 41 ++++++++----
 7 files changed, 141 insertions(+), 72 deletions(-)
 create mode 100644 cpp/src/binaryop/compiled/NullEquals.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 59dc3c74af2..37f93f1868b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -185,6 +185,7 @@ add_library(
   src/binaryop/compiled/LogicalOr.cu
   src/binaryop/compiled/Mod.cu
   src/binaryop/compiled/Mul.cu
+  src/binaryop/compiled/NullEquals.cu
   src/binaryop/compiled/NullMax.cu
   src/binaryop/compiled/NullMin.cu
   src/binaryop/compiled/PMod.cu
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
index bc0818ace4b..8d04f8bdcb2 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
@@ -50,14 +50,14 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
 }
 
 // TODO tparam boolean for null.
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut)                    \
+#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut)              \
   BENCHMARK_TEMPLATE_DEFINE_F(                                                         \
-    COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \
+    COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop)  \
   (::benchmark::State & st)                                                            \
   {                                                                                    \
     BM_compiled_binaryop<TypeLhs, TypeRhs, TypeOut>(st, cudf::binary_operator::binop); \
   }                                                                                    \
-  BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop)                                       \
+  BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name)                                        \
     ->Unit(benchmark::kMicrosecond)                                                    \
     ->UseManualTime()                                                                  \
     ->Arg(10000)      /* 10k */                                                        \
@@ -70,30 +70,36 @@ using namespace cudf;
 using namespace numeric;
 
 // clang-format off
-BINARYOP_BENCHMARK_DEFINE(float,        int64_t,      ADD,                  int32_t);
-BINARYOP_BENCHMARK_DEFINE(duration_s,   duration_D,   SUB,                  duration_ms);
-BINARYOP_BENCHMARK_DEFINE(float,        float,        MUL,                  int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      DIV,                  int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      TRUE_DIV,             int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      FLOOR_DIV,            int64_t);
-BINARYOP_BENCHMARK_DEFINE(double,       double,       MOD,                  double);
-BINARYOP_BENCHMARK_DEFINE(int32_t,      int64_t,      PMOD,                 double);
-BINARYOP_BENCHMARK_DEFINE(int32_t,      uint8_t,      PYMOD,                int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int64_t,      POW,                  double);
-BINARYOP_BENCHMARK_DEFINE(float,        double,       LOG_BASE,             double);
-BINARYOP_BENCHMARK_DEFINE(float,        double,       ATAN2,                double);
-BINARYOP_BENCHMARK_DEFINE(int,          int,          SHIFT_LEFT,           int);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int64_t,      SHIFT_RIGHT,          int);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int32_t,      SHIFT_RIGHT_UNSIGNED, int64_t);
-BINARYOP_BENCHMARK_DEFINE(int64_t,      int32_t,      BITWISE_AND,          int16_t);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int32_t,      BITWISE_OR,           int64_t);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int64_t,      BITWISE_XOR,          int32_t);
-BINARYOP_BENCHMARK_DEFINE(double,       int8_t,       LOGICAL_AND,          bool);
-BINARYOP_BENCHMARK_DEFINE(int16_t,      int64_t,      LOGICAL_OR,           bool);
-BINARYOP_BENCHMARK_DEFINE(duration_ms,  duration_ns,  EQUAL,                bool);
-BINARYOP_BENCHMARK_DEFINE(decimal32,    decimal32,    NOT_EQUAL,            bool);
-BINARYOP_BENCHMARK_DEFINE(timestamp_s,  timestamp_s,  LESS,                 bool);
-BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s,  GREATER,              bool);
-BINARYOP_BENCHMARK_DEFINE(duration_ms,  duration_ns,  NULL_EQUALS,          bool);
-BINARYOP_BENCHMARK_DEFINE(decimal32,    decimal32,    NULL_MAX,             decimal32);
-BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s,   NULL_MIN,             timestamp_s);
+BINARYOP_BENCHMARK_DEFINE(ADD_1,          float,        float,        ADD,                  float);
+BINARYOP_BENCHMARK_DEFINE(ADD_2,          timestamp_s,  duration_s,   ADD,                  timestamp_s);
+BINARYOP_BENCHMARK_DEFINE(SUB_1,          duration_s,   duration_D,   SUB,                  duration_ms);
+BINARYOP_BENCHMARK_DEFINE(SUB_2,          int64_t,      int64_t,      SUB,                  int64_t);
+BINARYOP_BENCHMARK_DEFINE(MUL_1,          float,        float,        MUL,                  int64_t);
+BINARYOP_BENCHMARK_DEFINE(MUL_2,          duration_s,   int64_t,      MUL,                  duration_s);
+BINARYOP_BENCHMARK_DEFINE(DIV_1,          int64_t,      int64_t,      DIV,                  int64_t);
+BINARYOP_BENCHMARK_DEFINE(DIV_2,          duration_ms,  int32_t,      DIV,                  duration_ms);
+BINARYOP_BENCHMARK_DEFINE(TRUE_DIV,       int64_t,      int64_t,      TRUE_DIV,             int64_t);
+BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV,      int64_t,      int64_t,      FLOOR_DIV,            int64_t);
+BINARYOP_BENCHMARK_DEFINE(MOD_1,          double,       double,       MOD,                  double);
+BINARYOP_BENCHMARK_DEFINE(MOD_2,          duration_ms,  int64_t,      MOD,                  duration_ms);
+BINARYOP_BENCHMARK_DEFINE(PMOD,           int32_t,      int64_t,      PMOD,                 double);
+BINARYOP_BENCHMARK_DEFINE(PYMOD,          int32_t,      uint8_t,      PYMOD,                int64_t);
+BINARYOP_BENCHMARK_DEFINE(POW,            int64_t,      int64_t,      POW,                  double);
+BINARYOP_BENCHMARK_DEFINE(LOG_BASE,       float,        double,       LOG_BASE,             double);
+BINARYOP_BENCHMARK_DEFINE(ATAN2,          float,        double,       ATAN2,                double);
+BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT,     int,          int,          SHIFT_LEFT,           int);
+BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT,    int16_t,      int64_t,      SHIFT_RIGHT,          int);
+BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT,   int64_t,      int32_t,      SHIFT_RIGHT_UNSIGNED, int64_t);
+BINARYOP_BENCHMARK_DEFINE(BITWISE_AND,    int64_t,      int32_t,      BITWISE_AND,          int16_t);
+BINARYOP_BENCHMARK_DEFINE(BITWISE_OR,     int16_t,      int32_t,      BITWISE_OR,           int64_t);
+BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR,    int16_t,      int64_t,      BITWISE_XOR,          int32_t);
+BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND,    double,       int8_t,       LOGICAL_AND,          bool);
+BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR,     int16_t,      int64_t,      LOGICAL_OR,           bool);
+BINARYOP_BENCHMARK_DEFINE(EQUAL_1,        int32_t,      int64_t,      EQUAL,                bool);
+BINARYOP_BENCHMARK_DEFINE(EQUAL_2,        duration_ms,  duration_ns,  EQUAL,                bool);
+BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL,      decimal32,    decimal32,    NOT_EQUAL,            bool);
+BINARYOP_BENCHMARK_DEFINE(LESS,           timestamp_s,  timestamp_s,  LESS,                 bool);
+BINARYOP_BENCHMARK_DEFINE(GREATER,        timestamp_ms, timestamp_s,  GREATER,              bool);
+BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS,    duration_ms,  duration_ns,  NULL_EQUALS,          bool);
+BINARYOP_BENCHMARK_DEFINE(NULL_MAX,       decimal32,    decimal32,    NULL_MAX,             decimal32);
+BINARYOP_BENCHMARK_DEFINE(NULL_MIN,       timestamp_D,  timestamp_s,  NULL_MIN,             timestamp_s);
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index a04b8309142..d7d38aba4f3 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -531,7 +531,7 @@ template <typename T1>
 struct double_type_dispatcher_second_type {
 #pragma nv_exec_check_disable
   template <typename T2, typename F, typename... Ts>
-  CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const
+  CUDF_HDFI decltype(auto) operator()(F&& f, Ts&&... args) const
   {
     return f.template operator()<T1, T2>(std::forward<Ts>(args)...);
   }
@@ -541,9 +541,7 @@ template <template <cudf::type_id> typename IdTypeMap>
 struct double_type_dispatcher_first_type {
 #pragma nv_exec_check_disable
   template <typename T1, typename F, typename... Ts>
-  CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(cudf::data_type type2,
-                                                      F&& f,
-                                                      Ts&&... args) const
+  CUDF_HDFI decltype(auto) operator()(cudf::data_type type2, F&& f, Ts&&... args) const
   {
     return type_dispatcher<IdTypeMap>(type2,
                                       detail::double_type_dispatcher_second_type<T1>{},
@@ -568,10 +566,10 @@ struct double_type_dispatcher_first_type {
  */
 #pragma nv_exec_check_disable
 template <template <cudf::type_id> typename IdTypeMap = id_to_type_impl, typename F, typename... Ts>
-CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) double_type_dispatcher(cudf::data_type type1,
-                                                                          cudf::data_type type2,
-                                                                          F&& f,
-                                                                          Ts&&... args)
+CUDF_HDFI constexpr decltype(auto) double_type_dispatcher(cudf::data_type type1,
+                                                          cudf::data_type type2,
+                                                          F&& f,
+                                                          Ts&&... args)
 {
   return type_dispatcher<IdTypeMap>(type1,
                                     detail::double_type_dispatcher_first_type<IdTypeMap>{},
diff --git a/cpp/src/binaryop/compiled/NullEquals.cu b/cpp/src/binaryop/compiled/NullEquals.cu
new file mode 100644
index 00000000000..3fc76e804f7
--- /dev/null
+++ b/cpp/src/binaryop/compiled/NullEquals.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "binary_ops.cuh"
+// Explicit instantiation of apply_binary_op for ops::NullEquals in this translation unit.
+namespace cudf::binops::compiled {
+template void apply_binary_op<ops::NullEquals>(mutable_column_device_view&,
+                                               column_device_view const&,
+                                               column_device_view const&,
+                                               bool is_lhs_scalar,
+                                               bool is_rhs_scalar,
+                                               rmm::cuda_stream_view);
+}  // namespace cudf::binops::compiled
diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index 7b0139a0082..71d9b615153 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -316,7 +316,6 @@ case binary_operator::PYMOD:                apply_binary_op<ops::PyMod>(out, lhs
 case binary_operator::POW:                  apply_binary_op<ops::Pow>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::EQUAL:
 case binary_operator::NOT_EQUAL:
-case binary_operator::NULL_EQUALS:
 if(out.type().id() != type_id::BOOL8) CUDF_FAIL("Output type of Comparison operator should be bool type");
 dispatch_equality_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, op, stream); break;
 case binary_operator::LESS:                 apply_binary_op<ops::Less>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
@@ -337,6 +336,7 @@ case binary_operator::SHIFT_RIGHT_UNSIGNED: apply_binary_op<ops::ShiftRightUnsig
 case binary_operator::LOG_BASE:             apply_binary_op<ops::LogBase>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::ATAN2:                apply_binary_op<ops::ATan2>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::PMOD:                 apply_binary_op<ops::PMod>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
+case binary_operator::NULL_EQUALS:          apply_binary_op<ops::NullEquals>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::NULL_MAX:             apply_binary_op<ops::NullMax>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::NULL_MIN:             apply_binary_op<ops::NullMin>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 default:;
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 84147fc9220..f10d3dd1a62 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -177,35 +177,51 @@ struct ops2_wrapper {
 };
 
 /**
- * @brief Functor which does single, and double type dispatcher in device code
+ * @brief Functor that performs single type dispatch in device code.
  *
  * single type dispatcher for lhs and rhs with common types.
+ * operator() calls type_dispatcher with `common_data_type`, an ops_wrapper and index `i`.
+ * @tparam BinaryOperator binary operator functor
+ */
+template <class BinaryOperator>
+struct binary_op_device_dispatcher {
+  data_type common_data_type;  // type handed to type_dispatcher in operator()
+  mutable_column_device_view out;
+  column_device_view lhs;
+  column_device_view rhs;
+  bool is_lhs_scalar;
+  bool is_rhs_scalar;
+
+  __forceinline__ __device__ void operator()(size_type i)
+  {
+    type_dispatcher(common_data_type,
+                    ops_wrapper<BinaryOperator>{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
+                    i);
+  }
+};
+
+/**
+ * @brief Functor that performs double type dispatch in device code.
+ *
  * double type dispatcher for lhs and rhs without common types.
  *
  * @tparam BinaryOperator binary operator functor
  */
 template <class BinaryOperator>
-struct device_type_dispatcher {
+struct binary_op_double_device_dispatcher {
   mutable_column_device_view out;
   column_device_view lhs;
   column_device_view rhs;
   bool is_lhs_scalar;
   bool is_rhs_scalar;
-  std::optional<data_type> common_data_type;
 
-  __device__ void operator()(size_type i)
+  __forceinline__ __device__ void operator()(size_type i)
   {
-    if (common_data_type) {
-      type_dispatcher(*common_data_type,
-                      ops_wrapper<BinaryOperator>{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
-                      i);
-    } else {
-      double_type_dispatcher(
-        lhs.type(),
-        rhs.type(),
-        ops2_wrapper<BinaryOperator>{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
-        i);
-    }
+    double_type_dispatcher(
+      lhs.type(),  // passed as type1 of double_type_dispatcher
+      rhs.type(),  // passed as type2 of double_type_dispatcher
+      ops2_wrapper<BinaryOperator>{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
+      i);
   }
 };
 
@@ -263,10 +279,19 @@ void apply_binary_op(mutable_column_device_view& outd,
   auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type());
 
   // Create binop functor instance
-  auto binop_func = device_type_dispatcher<BinaryOperator>{
-    outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype};
-  // Execute it on every element
-  for_each(stream, outd.size(), binop_func);
+  if (common_dtype) {
+    // Execute it on every element
+    for_each(stream,
+             outd.size(),
+             binary_op_device_dispatcher<BinaryOperator>{
+               *common_dtype, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar});
+  } else {
+    // Execute it on every element
+    for_each(stream,
+             outd.size(),
+             binary_op_double_device_dispatcher<BinaryOperator>{
+               outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar});
+  }
 }
 
 }  // namespace compiled
diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu
index feee310716a..03c3e373476 100644
--- a/cpp/src/binaryop/compiled/equality_ops.cu
+++ b/cpp/src/binaryop/compiled/equality_ops.cu
@@ -28,19 +28,32 @@ void dispatch_equality_op(mutable_column_device_view& outd,
   auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type());
 
   // Execute it on every element
-  for_each(
-    stream,
-    outd.size(),
-    [op, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype] __device__(size_type i) {
-      // clang-format off
-      // Similar enabled template types should go together (better performance)
-      switch (op) {
-      case binary_operator::EQUAL:         device_type_dispatcher<ops::Equal>{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break;
-      case binary_operator::NOT_EQUAL:     device_type_dispatcher<ops::NotEqual>{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break;
-      case binary_operator::NULL_EQUALS:   device_type_dispatcher<ops::NullEquals>{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break;
-      default:;
-      }
-      // clang-format on
-    });
+
+  if (common_dtype) {
+    if (op == binary_operator::EQUAL) {
+      for_each(stream,
+               outd.size(),
+               binary_op_device_dispatcher<ops::Equal>{
+                 *common_dtype, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar});
+    } else if (op == binary_operator::NOT_EQUAL) {
+      for_each(stream,
+               outd.size(),
+               binary_op_device_dispatcher<ops::NotEqual>{
+                 *common_dtype, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar});
+    }
+  } else {
+    if (op == binary_operator::EQUAL) {
+      for_each(stream,
+               outd.size(),
+               binary_op_double_device_dispatcher<ops::Equal>{
+                 outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar});
+    } else if (op == binary_operator::NOT_EQUAL) {
+      for_each(stream,
+               outd.size(),
+               binary_op_double_device_dispatcher<ops::NotEqual>{
+                 outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar});
+    }
+  }
 }
+
 }  // namespace cudf::binops::compiled