diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh
index 88a48ea2e3b..29f6265ec63 100755
--- a/ci/cpu/upload.sh
+++ b/ci/cpu/upload.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # Copyright (c) 2018-2022, NVIDIA CORPORATION.
 # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 set -e
 
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 5575b69c226..f07d7984cd1 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 ########################
 # cuDF Version Updater #
 ########################
diff --git a/ci/utils/nbtestlog2junitxml.py b/ci/utils/nbtestlog2junitxml.py
index 6a421279112..14384af3225 100644
--- a/ci/utils/nbtestlog2junitxml.py
+++ b/ci/utils/nbtestlog2junitxml.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 # Generate a junit-xml file from parsing a nbtest log
 
 import re
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9936db5b2fa..d9422edaa8f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -156,6 +156,21 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake)
 # find cuFile
 include(cmake/Modules/FindcuFile.cmake)
 
+# Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved
+if(NOT BUILD_SHARED_LIBS)
+  include("${rapids-cmake-dir}/export/find_package_file.cmake")
+  list(APPEND METADATA_KINDS BUILD INSTALL)
+  foreach(METADATA_KIND IN LISTS METADATA_KINDS)
+    rapids_export_find_package_file(
+      ${METADATA_KIND} "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports
+    )
+    rapids_export_package(${METADATA_KIND} cuco cudf-exports)
+    rapids_export_package(${METADATA_KIND} ZLIB cudf-exports)
+    rapids_export_package(${METADATA_KIND} cuFile cudf-exports)
+    rapids_export_package(${METADATA_KIND} nvcomp cudf-exports)
+  endforeach()
+endif()
+
 # ##################################################################################################
 # * library targets -------------------------------------------------------------------------------
 
diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp
index 21e5db8ca8f..67ea6129a74 100644
--- a/cpp/benchmarks/column/concatenate.cpp
+++ b/cpp/benchmarks/column/concatenate.cpp
@@ -45,7 +45,7 @@ static void BM_concatenate(benchmark::State& state)
   auto input_columns = input->view();
   std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());
 
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -87,7 +87,7 @@ static void BM_concatenate_tables(benchmark::State& state)
     return table->view();
   });
 
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -146,7 +146,7 @@ static void BM_concatenate_strings(benchmark::State& state)
       return static_cast<cudf::column_view>(col);
     });
 
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index 3af64b0945a..b6a37453a13 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -87,6 +87,15 @@ T get_distribution_mean(distribution_params<T> const& dist)
   }
 }
 
+/**
+ * @brief Computes the average element size in a column, given the data profile.
+ *
+ * Random distribution parameters like average string length and maximum list nesting level affect
+ * the element size of non-fixed-width columns. For lists and structs, `avg_element_size` is called
+ * recursively to determine the size of nested columns.
+ */
+size_t avg_element_size(data_profile const& profile, cudf::data_type dtype);
+
 // Utilities to determine the mean size of an element, given the data profile
 template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_width<T>())>
 size_t non_fixed_width_size(data_profile const& profile)
@@ -112,10 +121,22 @@ size_t non_fixed_width_size<cudf::list_view>(data_profile const& profile)
 {
   auto const dist_params       = profile.get_distribution_params<cudf::list_view>();
   auto const single_level_mean = get_distribution_mean(dist_params.length_params);
-  auto const element_size      = cudf::size_of(cudf::data_type{dist_params.element_type});
+  auto const element_size = avg_element_size(profile, cudf::data_type{dist_params.element_type});
   return element_size * pow(single_level_mean, dist_params.max_depth);
 }
 
+template <>
+size_t non_fixed_width_size<cudf::struct_view>(data_profile const& profile)
+{
+  auto const dist_params = profile.get_distribution_params<cudf::struct_view>();
+  return std::accumulate(dist_params.leaf_types.cbegin(),
+                         dist_params.leaf_types.cend(),
+                         0ul,
+                         [&](auto& sum, auto type_id) {
+                           return sum + avg_element_size(profile, cudf::data_type{type_id});
+                         });
+}
+
 struct non_fixed_width_size_fn {
   template <typename T>
   size_t operator()(data_profile const& profile)
@@ -527,14 +548,6 @@ std::unique_ptr<cudf::column> create_random_column<cudf::dictionary32>(data_prof
   CUDF_FAIL("not implemented yet");
 }
 
-template <>
-std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profile const& profile,
-                                                                      thrust::minstd_rand& engine,
-                                                                      cudf::size_type num_rows)
-{
-  CUDF_FAIL("not implemented yet");
-}
-
 /**
  * @brief Functor to dispatch create_random_column calls.
  */
@@ -549,6 +562,93 @@ struct create_rand_col_fn {
   }
 };
 
+/**
+ * @brief Calculates the number of direct parents needed to generate a struct column hierarchy with
+ * lowest maximum number of children in any nested column.
+ *
+ * Used to generate an "evenly distributed" struct column hierarchy with the given number of leaf
+ * columns and nesting levels. The column tree is considered evenly distributed if all columns have
+ * nearly the same number of child columns (difference not larger than one).
+ */
+int num_direct_parents(int num_lvls, int num_leaf_columns)
+{
+  // Estimated average number of children in the hierarchy;
+  auto const num_children_avg = std::pow(num_leaf_columns, 1. / num_lvls);
+  // Minimum number of children columns for any column in the hierarchy
+  int const num_children_min = std::floor(num_children_avg);
+  // Maximum number of children columns for any column in the hierarchy
+  int const num_children_max = num_children_min + 1;
+
+  // Minimum number of columns needed so that their number of children does not exceed the maximum
+  int const min_for_current_nesting = std::ceil((double)num_leaf_columns / num_children_max);
+  // Minimum number of columns needed so that columns at the higher levels have at least the minimum
+  // number of children
+  int const min_for_upper_nesting = std::pow(num_children_min, num_lvls - 1);
+  // Both conditions need to be satisfied
+  return std::max(min_for_current_nesting, min_for_upper_nesting);
+}
+
+template <>
+std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profile const& profile,
+                                                                      thrust::minstd_rand& engine,
+                                                                      cudf::size_type num_rows)
+{
+  auto const dist_params = profile.get_distribution_params<cudf::struct_view>();
+
+  // Generate leaf columns
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.reserve(dist_params.leaf_types.size());
+  std::transform(dist_params.leaf_types.cbegin(),
+                 dist_params.leaf_types.cend(),
+                 std::back_inserter(children),
+                 [&](auto& type_id) {
+                   return cudf::type_dispatcher(
+                     cudf::data_type(type_id), create_rand_col_fn{}, profile, engine, num_rows);
+                 });
+
+  auto valid_dist =
+    random_value_fn<bool>(distribution_params<bool>{1. - profile.get_null_frequency().value_or(0)});
+
+  // Generate the column bottom-up
+  for (int lvl = dist_params.max_depth; lvl > 0; --lvl) {
+    // Generating the next level
+    std::vector<std::unique_ptr<cudf::column>> parents;
+    parents.resize(num_direct_parents(lvl, children.size()));
+
+    auto current_child = children.begin();
+    for (auto current_parent = parents.begin(); current_parent != parents.end(); ++current_parent) {
+      auto [null_mask, null_count] = [&]() {
+        if (profile.get_null_frequency().has_value()) {
+          auto valids = valid_dist(engine, num_rows);
+          return cudf::detail::valid_if(valids.begin(), valids.end(), thrust::identity<bool>{});
+        }
+        return std::pair<rmm::device_buffer, cudf::size_type>{};
+      }();
+
+      // Adopt remaining children as evenly as possible
+      auto const num_to_adopt = cudf::util::div_rounding_up_unsafe(
+        std::distance(current_child, children.end()), std::distance(current_parent, parents.end()));
+      CUDF_EXPECTS(num_to_adopt > 0, "No children columns left to adopt");
+
+      std::vector<std::unique_ptr<cudf::column>> children_to_adopt;
+      children_to_adopt.insert(children_to_adopt.end(),
+                               std::make_move_iterator(current_child),
+                               std::make_move_iterator(current_child + num_to_adopt));
+      current_child += children_to_adopt.size();
+
+      *current_parent = cudf::make_structs_column(
+        num_rows, std::move(children_to_adopt), null_count, std::move(null_mask));
+    }
+
+    if (lvl == 1) {
+      CUDF_EXPECTS(parents.size() == 1, "There should be one top-level column");
+      return std::move(parents.front());
+    }
+    children = std::move(parents);
+  }
+  CUDF_FAIL("Reached unreachable code in struct column creation");
+}
+
 template <typename T>
 struct clamp_down : public thrust::unary_function<T, T> {
   T max;
diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
index c955f60f97e..8a4e3783da5 100644
--- a/cpp/benchmarks/common/generate_input.hpp
+++ b/cpp/benchmarks/common/generate_input.hpp
@@ -171,6 +171,15 @@ struct distribution_params<T, std::enable_if_t<std::is_same_v<T, cudf::list_view
   cudf::size_type max_depth;
 };
 
+/**
+ * @brief Structs are parameterized by the maximal nesting level, and the leaf column types.
+ */
+template <typename T>
+struct distribution_params<T, std::enable_if_t<std::is_same_v<T, cudf::struct_view>>> {
+  std::vector<cudf::type_id> leaf_types;
+  cudf::size_type max_depth;
+};
+
 // Present for compilation only. To be implemented once reader/writers support the fixed width type.
 template <typename T>
 struct distribution_params<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
@@ -214,6 +223,8 @@ class data_profile {
   distribution_params<cudf::string_view> string_dist_desc{{distribution_id::NORMAL, 0, 32}};
   distribution_params<cudf::list_view> list_dist_desc{
     cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2};
+  distribution_params<cudf::struct_view> struct_dist_desc{
+    {cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2};
   std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;
 
   double bool_probability              = 0.5;
@@ -281,6 +292,12 @@ class data_profile {
     return list_dist_desc;
   }
 
+  template <typename T, std::enable_if_t<std::is_same_v<T, cudf::struct_view>>* = nullptr>
+  distribution_params<T> get_distribution_params() const
+  {
+    return struct_dist_desc;
+  }
+
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
   distribution_params<typename T::rep> get_distribution_params() const
   {
@@ -357,8 +374,28 @@ class data_profile {
   void set_cardinality(cudf::size_type c) { cardinality = c; }
   void set_avg_run_length(cudf::size_type avg_rl) { avg_run_length = avg_rl; }
 
-  void set_list_depth(cudf::size_type max_depth) { list_dist_desc.max_depth = max_depth; }
+  void set_list_depth(cudf::size_type max_depth)
+  {
+    CUDF_EXPECTS(max_depth > 0, "List depth must be positive");
+    list_dist_desc.max_depth = max_depth;
+  }
+
   void set_list_type(cudf::type_id type) { list_dist_desc.element_type = type; }
+
+  void set_struct_depth(cudf::size_type max_depth)
+  {
+    CUDF_EXPECTS(max_depth > 0, "Struct depth must be positive");
+    struct_dist_desc.max_depth = max_depth;
+  }
+
+  void set_struct_types(std::vector<cudf::type_id> const& types)
+  {
+    CUDF_EXPECTS(
+      std::none_of(
+        types.cbegin(), types.cend(), [](auto& type) { return type == cudf::type_id::STRUCT; }),
+      "Cannot include STRUCT as its own subtype");
+    struct_dist_desc.leaf_types = types;
+  }
 };
 
 /**
diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp
index 5f23cbbafdd..e153abee3a3 100644
--- a/cpp/benchmarks/fixture/benchmark_fixture.hpp
+++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index 3743be8bd5a..afe0cc77a4c 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp
index c74ee191d4e..2ed534d5333 100644
--- a/cpp/benchmarks/io/cuio_common.hpp
+++ b/cpp/benchmarks/io/cuio_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp
index 29d4860a0e5..0fc2238a272 100644
--- a/cpp/benchmarks/io/orc/orc_reader.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader.cpp
@@ -166,6 +166,7 @@ RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id:
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
+RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, struct, cudf::type_id::STRUCT);
 
 BENCHMARK_DEFINE_F(OrcRead, column_selection)
 (::benchmark::State& state) { BM_orc_read_varying_options(state); }
diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp
index e24ca7f749d..525c13af5c0 100644
--- a/cpp/benchmarks/io/orc/orc_writer.cpp
+++ b/cpp/benchmarks/io/orc/orc_writer.cpp
@@ -116,6 +116,7 @@ WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, type_group_id::F
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
+WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, struct, cudf::type_id::STRUCT);
 
 BENCHMARK_DEFINE_F(OrcWrite, writer_options)
 (::benchmark::State& state) { BM_orc_write_varying_options(state); }
diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp
index 74613e50158..8a97fd35c31 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp
@@ -166,6 +166,7 @@ RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);
+RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, struct, cudf::type_id::STRUCT);
 
 BENCHMARK_DEFINE_F(ParquetRead, column_selection)
 (::benchmark::State& state) { BM_parq_read_varying_options(state); }
diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp
index d203f0d27c8..d25fae42d0e 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp
@@ -111,6 +111,7 @@ WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
+WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, struct, cudf::type_id::STRUCT);
 
 BENCHMARK_DEFINE_F(ParquetWrite, writer_options)
 (::benchmark::State& state) { BM_parq_write_varying_options(state); }
diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu
index 69fb28d29b2..3c4208bf0fc 100644
--- a/cpp/benchmarks/join/conditional_join.cu
+++ b/cpp/benchmarks/join/conditional_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index e846317f472..5df77ac4319 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -139,18 +139,18 @@ void generate_input_tables(key_type* const build_tbl,
 
   // Maximize exposed parallelism while minimizing storage for curand state
   int num_blocks_init_build_tbl{-1};
-  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_build_tbl, init_build_tbl<key_type, size_type>, block_size, 0));
 
   int num_blocks_init_probe_tbl{-1};
-  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));
 
   int dev_id{-1};
-  CUDA_TRY(cudaGetDevice(&dev_id));
+  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
 
   int num_sms{-1};
-  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
 
   const int num_states =
     num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
@@ -158,12 +158,12 @@ void generate_input_tables(key_type* const build_tbl,
 
   init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states);
 
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   init_build_tbl<key_type, size_type><<<num_sms * num_blocks_init_build_tbl, block_size>>>(
     build_tbl, build_tbl_size, multiplicity, devStates.data(), num_states);
 
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   auto const rand_max = std::numeric_limits<key_type>::max();
 
@@ -177,5 +177,5 @@ void generate_input_tables(key_type* const build_tbl,
                                                           devStates.data(),
                                                           num_states);
 
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 }
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index 27339248968..6ff2543cf7d 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -116,7 +116,7 @@ static void BM_join(state_type& state, Join JoinFunc)
   auto build_payload_column = cudf::sequence(build_table_size, *init);
   auto probe_payload_column = cudf::sequence(probe_table_size, *init);
 
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   cudf::table_view build_table({build_key_column->view(), *build_payload_column});
   cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column});
diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu
index e332b70e30a..58a1c2d7f29 100644
--- a/cpp/benchmarks/join/left_join.cu
+++ b/cpp/benchmarks/join/left_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu
index 22e4be9ce9d..7f6d5cc5468 100644
--- a/cpp/benchmarks/lists/copying/scatter_lists.cu
+++ b/cpp/benchmarks/lists/copying/scatter_lists.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/null_mask/set_null_mask.cpp b/cpp/benchmarks/null_mask/set_null_mask.cpp
index d48d49c205d..2057951ff8d 100644
--- a/cpp/benchmarks/null_mask/set_null_mask.cpp
+++ b/cpp/benchmarks/null_mask/set_null_mask.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp
index 5259ac29f99..dc9a1e991b2 100644
--- a/cpp/benchmarks/string/convert_durations.cpp
+++ b/cpp/benchmarks/string/convert_durations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
index bd8a4d1de76..bbf90e6f68a 100644
--- a/cpp/benchmarks/synchronization/synchronization.cpp
+++ b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,32 +29,32 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
   // flush all of L2$
   if (flush_l2_cache) {
     int current_device = 0;
-    CUDA_TRY(cudaGetDevice(&current_device));
+    CUDF_CUDA_TRY(cudaGetDevice(&current_device));
 
     int l2_cache_bytes = 0;
-    CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
+    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
 
     if (l2_cache_bytes > 0) {
       const int memset_value = 0;
       rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
-      CUDA_TRY(
+      CUDF_CUDA_TRY(
         cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
     }
   }
 
-  CUDA_TRY(cudaEventCreate(&start));
-  CUDA_TRY(cudaEventCreate(&stop));
-  CUDA_TRY(cudaEventRecord(start, stream.value()));
+  CUDF_CUDA_TRY(cudaEventCreate(&start));
+  CUDF_CUDA_TRY(cudaEventCreate(&stop));
+  CUDF_CUDA_TRY(cudaEventRecord(start, stream.value()));
 }
 
 cuda_event_timer::~cuda_event_timer()
 {
-  CUDA_TRY(cudaEventRecord(stop, stream.value()));
-  CUDA_TRY(cudaEventSynchronize(stop));
+  CUDF_CUDA_TRY(cudaEventRecord(stop, stream.value()));
+  CUDF_CUDA_TRY(cudaEventSynchronize(stop));
 
   float milliseconds = 0.0f;
-  CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
+  CUDF_CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
   p_state->SetIterationTime(milliseconds / (1000.0f));
-  CUDA_TRY(cudaEventDestroy(start));
-  CUDA_TRY(cudaEventDestroy(stop));
+  CUDF_CUDA_TRY(cudaEventDestroy(start));
+  CUDF_CUDA_TRY(cudaEventDestroy(stop));
 }
diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp
index 0a0e6a1667c..3fbb6054d5c 100644
--- a/cpp/benchmarks/text/replace.cpp
+++ b/cpp/benchmarks/text/replace.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp
index 2406ddd39ae..150f578a22a 100644
--- a/cpp/benchmarks/text/subword.cpp
+++ b/cpp/benchmarks/text/subword.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index 6ab6f9a2095..aba78dad3fe 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -196,13 +196,13 @@ void type_dispatcher_benchmark(::benchmark::State& state)
   rmm::device_uvector<TypeParam*> d_vec(n_cols, rmm::cuda_stream_default);
 
   if (dispatching_type == NO_DISPATCHING) {
-    CUDA_TRY(cudaMemcpy(
+    CUDF_CUDA_TRY(cudaMemcpy(
       d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
   }
 
   // Warm up
   launch_kernel<functor_type, dispatching_type>(source_table, d_vec.data(), work_per_thread);
-  CUDA_TRY(cudaDeviceSynchronize());
+  CUDF_CUDA_TRY(cudaDeviceSynchronize());
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 5a20f78b798..1639655d1e9 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -18,10 +18,13 @@ function(find_and_configure_cucollections)
   # Find or install cuCollections
   rapids_cpm_find(
     # cuCollections doesn't have a version yet
-    cuco 0.0
+    cuco 0.0.1
     GLOBAL_TARGETS cuco::cuco
+    BUILD_EXPORT_SET cudf-exports
+    INSTALL_EXPORT_SET cudf-exports
     CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections
-    GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411
+    GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5
+    EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS}
     OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
   )
 
diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake
index c1765408d62..0356725548b 100644
--- a/cpp/cmake/thirdparty/get_nvcomp.cmake
+++ b/cpp/cmake/thirdparty/get_nvcomp.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -13,14 +13,15 @@
 # =============================================================================
 
 # This function finds nvcomp and sets any additional necessary environment variables.
-function(find_and_configure_nvcomp VERSION)
-
-  # Find or install nvcomp
+function(find_and_configure_nvcomp VERSION_MIN VERSION_MAX)
+  # Search for latest version of nvComp
+  rapids_find_package(nvcomp ${VERSION_MAX} QUIET)
+  # If latest isn't found, fall back to building oldest support from source
   rapids_cpm_find(
-    nvcomp ${VERSION}
+    nvcomp ${VERSION_MIN}
     GLOBAL_TARGETS nvcomp::nvcomp
     CPM_ARGS GITHUB_REPOSITORY NVIDIA/nvcomp
-    GIT_TAG c435afaf4ba8a8d12f379d688effcb185886cec1
+    GIT_TAG v${VERSION_MIN}
     OPTIONS "BUILD_STATIC ON" "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
   )
 
@@ -32,9 +33,8 @@ function(find_and_configure_nvcomp VERSION)
   if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM)
     target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM)
   endif()
-
 endfunction()
 
-set(CUDF_MIN_VERSION_nvCOMP 2.1.0)
-
-find_and_configure_nvcomp(${CUDF_MIN_VERSION_nvCOMP})
+set(CUDF_MIN_VERSION_nvCOMP 2.2.0)
+set(CUDF_MAX_VERSION_nvCOMP 2.3.0)
+find_and_configure_nvcomp(${CUDF_MIN_VERSION_nvCOMP} ${CUDF_MAX_VERSION_nvCOMP})
diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md
index eeebe38d873..1599c81cbe5 100644
--- a/cpp/docs/DEVELOPER_GUIDE.md
+++ b/cpp/docs/DEVELOPER_GUIDE.md
@@ -373,7 +373,7 @@ namespace detail{
     void external_function(..., rmm::cuda_stream_view stream){
         // Implementation uses the stream with async APIs.
         rmm::device_buffer buff(...,stream);
-        CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
+        CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
         kernel<<<..., stream>>>(...);
         thrust::algorithm(rmm::exec_policy(stream), ...);
     }
@@ -777,7 +777,7 @@ CUDF_FAIL("This code path should not be reached.");
 
 ### CUDA Error Checking
 
-Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
+Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
 macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The
 thrown exception includes a description of the CUDA error code in its `what()` message.
 
diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
index 0ada2977ead..0871778db81 100644
--- a/cpp/examples/basic/CMakeLists.txt
+++ b/cpp/examples/basic/CMakeLists.txt
@@ -1,3 +1,5 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 cmake_minimum_required(VERSION 3.18)
 
 project(
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 0087dd1b173..ecaa4a30cf0 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -254,10 +254,10 @@ struct scatter_gather_functor {
     if (output.nullable()) {
       // Have to initialize the output mask to all zeros because we may update
       // it with atomicOr().
-      CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
-                               0,
-                               cudf::bitmask_allocation_size_bytes(output.size()),
-                               stream.value()));
+      CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
+                                    0,
+                                    cudf::bitmask_allocation_size_bytes(output.size()),
+                                    stream.value()));
     }
 
     auto output_device_view = cudf::mutable_column_device_view::create(output, stream);
@@ -344,7 +344,7 @@ std::unique_ptr<table> copy_if(
 
   // initialize just the first element of block_offsets to 0 since the InclusiveSum below
   // starts at the second element.
-  CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
 
   // 2. Find the offset for each block's output using a scan of block counts
   if (grid.num_blocks > 1) {
@@ -370,7 +370,7 @@ std::unique_ptr<table> copy_if(
   // As it is InclusiveSum, last value in block_offsets will be output_size
   // unless num_blocks == 1, in which case output_size is just block_counts[0]
   cudf::size_type output_size{0};
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     &output_size,
     grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(),
     sizeof(cudf::size_type),
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index ac59b429a2c..6703db305a1 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin,
       nullptr);
   }
 
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 }
 
 /**
diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh
index 56c0289dc0a..49a406ab5f0 100644
--- a/cpp/include/cudf/detail/get_value.cuh
+++ b/cpp/include/cudf/detail/get_value.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,11 +49,11 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre
   CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(),
                "invalid element_index value");
   T result;
-  CUDA_TRY(cudaMemcpyAsync(&result,
-                           col_view.data<T>() + element_index,
-                           sizeof(T),
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(&result,
+                                col_view.data<T>() + element_index,
+                                sizeof(T),
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
   stream.synchronize();
   return result;
 }
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index be010689847..7aec56fdc51 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -170,20 +170,20 @@ size_type inplace_bitmask_binop(
   rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
   rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
 
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyHostToDevice, stream.value()));
-  CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
-                           masks_begin_bits.data(),
-                           masks_begin_bits.size_bytes(),
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
+                                masks_begin_bits.data(),
+                                masks_begin_bits.size_bytes(),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
 
   auto constexpr block_size = 256;
   cudf::detail::grid_1d config(dest_mask.size(), block_size);
   offset_bitmask_binop<block_size>
     <<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
   return d_counter.value(stream);
 }
 
@@ -298,27 +298,25 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
 
   // Allocate temporary memory.
   size_t temp_storage_bytes{0};
-  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
-                                           temp_storage_bytes,
-                                           num_set_bits_in_word,
-                                           d_bit_counts.begin(),
-                                           num_ranges,
-                                           first_word_indices,
-                                           last_word_indices,
-                                           stream.value()));
+  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
+                                                temp_storage_bytes,
+                                                num_set_bits_in_word,
+                                                d_bit_counts.begin(),
+                                                num_ranges,
+                                                first_word_indices,
+                                                last_word_indices,
+                                                stream.value()));
   rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
 
   // Perform segmented reduction.
-  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
-                                           temp_storage_bytes,
-                                           num_set_bits_in_word,
-                                           d_bit_counts.begin(),
-                                           num_ranges,
-                                           first_word_indices,
-                                           last_word_indices,
-                                           stream.value()));
-
-  CHECK_CUDA(stream.value());
+  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
+                                                temp_storage_bytes,
+                                                num_set_bits_in_word,
+                                                d_bit_counts.begin(),
+                                                num_ranges,
+                                                first_word_indices,
+                                                last_word_indices,
+                                                stream.value()));
 
   // Adjust counts in segment boundaries (if segments are not word-aligned).
   constexpr size_type block_size{256};
@@ -350,7 +348,7 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
                       });
   }
 
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
   return d_bit_counts;
 }
 
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index a00bd64caa3..23d0ff26e0f 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -127,12 +127,12 @@ cudf::size_type elements_per_thread(Kernel kernel,
 
   // calculate theoretical occupancy
   int max_blocks = 0;
-  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
 
   int device = 0;
-  CUDA_TRY(cudaGetDevice(&device));
+  CUDF_CUDA_TRY(cudaGetDevice(&device));
   int num_sms = 0;
-  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
   int per_thread = total_size / (max_blocks * num_sms * block_size);
   return std::clamp(per_thread, 1, max_per_thread);
 }
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index e3f44ce0bee..63ac48f6060 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -53,7 +53,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(size, stream, mr);
-  CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
   return ret;
 }
 
@@ -75,7 +75,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_sync(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(size, stream, mr);
-  CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
   stream.synchronize();
   return ret;
 }
@@ -99,11 +99,11 @@ rmm::device_uvector<T> make_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                           source_data.data(),
-                           source_data.size() * sizeof(T),
-                           cudaMemcpyDefault,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
+                                source_data.data(),
+                                source_data.size() * sizeof(T),
+                                cudaMemcpyDefault,
+                                stream.value()));
   return ret;
 }
 
@@ -151,11 +151,11 @@ rmm::device_uvector<T> make_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                           source_data.data(),
-                           source_data.size() * sizeof(T),
-                           cudaMemcpyDefault,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
+                                source_data.data(),
+                                source_data.size() * sizeof(T),
+                                cudaMemcpyDefault,
+                                stream.value()));
   return ret;
 }
 
@@ -286,7 +286,7 @@ template <typename T, typename OutContainer>
 OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
   OutContainer result(v.size());
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDeviceToHost, stream.value()));
   return result;
 }
diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
index ffe159b59dc..56db3fd6216 100644
--- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
@@ -78,14 +78,14 @@ class istream_data_chunk_reader : public data_chunk_reader {
   {
     // create an event to track the completion of the last device-to-host copy.
     for (std::size_t i = 0; i < _tickets.size(); i++) {
-      CUDA_TRY(cudaEventCreate(&(_tickets[i].event)));
+      CUDF_CUDA_TRY(cudaEventCreate(&(_tickets[i].event)));
     }
   }
 
   ~istream_data_chunk_reader()
   {
     for (std::size_t i = 0; i < _tickets.size(); i++) {
-      CUDA_TRY(cudaEventDestroy(_tickets[i].event));
+      CUDF_CUDA_TRY(cudaEventDestroy(_tickets[i].event));
     }
   }
 
@@ -101,7 +101,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
     _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size();
 
     // synchronize on the last host-to-device copy, so we don't clobber the host buffer.
-    CUDA_TRY(cudaEventSynchronize(h_ticket.event));
+    CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event));
 
     // resize the host buffer as necessary to contain the requested number of bytes
     if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); }
@@ -116,7 +116,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
     auto chunk = rmm::device_uvector<char>(read_size, stream);
 
     // copy the host-pinned data on to device
-    CUDA_TRY(cudaMemcpyAsync(  //
+    CUDF_CUDA_TRY(cudaMemcpyAsync(  //
       chunk.data(),
       h_ticket.buffer.data(),
       read_size,
@@ -124,7 +124,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
       stream.value()));
 
     // record the host-to-device copy.
-    CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
+    CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
 
     // return the view over device memory so it can be processed.
     return std::make_unique<device_uvector_data_chunk>(std::move(chunk));
diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh
index 4b036fb7f0e..b9ea2d9ecff 100644
--- a/cpp/include/cudf/strings/detail/utilities.cuh
+++ b/cpp/include/cudf/strings/detail/utilities.cuh
@@ -67,7 +67,7 @@ std::unique_ptr<column> make_offsets_child_column(
   // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first
   // offset values to zero manually.
   thrust::inclusive_scan(rmm::exec_policy(stream), begin, end, d_offsets + 1);
-  CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value()));
   return offsets_column;
 }
 
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 27ee5cf95cd..a486a5a765c 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -99,7 +99,8 @@ CUDF_HOST_DEVICE inline string_view string_view::max()
 #if defined(__CUDA_ARCH__)
   psentinel = &cudf::strings::detail::max_string_sentinel[0];
 #else
-  CUDA_TRY(cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
+  CUDF_CUDA_TRY(
+    cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
 #endif
   return string_view(psentinel, 4);
 }
diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh
index 3ed18099463..8d08a3fd0b0 100644
--- a/cpp/include/cudf/table/table_device_view.cuh
+++ b/cpp/include/cudf/table/table_device_view.cuh
@@ -145,7 +145,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   auto d_columns = detail::child_columns_to_device_array<ColumnDeviceView>(
     source_view.begin(), source_view.end(), h_ptr, d_ptr);
 
-  CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
   stream.synchronize();
   return std::make_tuple(std::move(descendant_storage), d_columns);
 }
diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp
index 2036723a6ed..8be1a7e3a32 100644
--- a/cpp/include/cudf/utilities/error.hpp
+++ b/cpp/include/cudf/utilities/error.hpp
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #pragma once
 
 #include <cuda.h>
@@ -99,7 +115,7 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l
  * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an
  * exception detailing the CUDA error that occurred
  */
-#define CUDA_TRY(call)                                            \
+#define CUDF_CUDA_TRY(call)                                       \
   do {                                                            \
     cudaError_t const status = (call);                            \
     if (cudaSuccess != status) {                                  \
@@ -122,12 +138,12 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l
  * asynchronous kernel launch.
  */
 #ifndef NDEBUG
-#define CHECK_CUDA(stream)                   \
-  do {                                       \
-    CUDA_TRY(cudaStreamSynchronize(stream)); \
-    CUDA_TRY(cudaPeekAtLastError());         \
+#define CUDF_CHECK_CUDA(stream)                   \
+  do {                                            \
+    CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); \
+    CUDF_CUDA_TRY(cudaPeekAtLastError());         \
   } while (0);
 #else
-#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError());
+#define CUDF_CHECK_CUDA(stream) CUDF_CUDA_TRY(cudaPeekAtLastError());
 #endif
 /** @} */
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index 4c2d4d429eb..b28ed4f70fa 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -183,7 +183,8 @@ template <typename T, std::enable_if_t<not cudf::is_fixed_point<T>()>* = nullptr
 std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view c)
 {
   thrust::host_vector<T> host_data(c.size());
-  CUDA_TRY(cudaMemcpy(host_data.data(), c.data<T>(), c.size() * sizeof(T), cudaMemcpyDeviceToHost));
+  CUDF_CUDA_TRY(
+    cudaMemcpy(host_data.data(), c.data<T>(), c.size() * sizeof(T), cudaMemcpyDeviceToHost));
   return {host_data, bitmask_to_host(c)};
 }
 
@@ -206,7 +207,7 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
 
   auto host_rep_types = thrust::host_vector<Rep>(c.size());
 
-  CUDA_TRY(cudaMemcpy(
+  CUDF_CUDA_TRY(cudaMemcpy(
     host_rep_types.data(), c.begin<Rep>(), c.size() * sizeof(Rep), cudaMemcpyDeviceToHost));
 
   auto to_fp = [&](Rep val) { return T{scaled_integer<Rep>{val, scale_type{c.type().scale()}}}; };
diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py
index 3d462d65fb8..2a732dfdc67 100755
--- a/cpp/scripts/run-clang-format.py
+++ b/cpp/scripts/run-clang-format.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py
index 30e937d7f4d..f6b24ccb4bb 100644
--- a/cpp/scripts/run-clang-tidy.py
+++ b/cpp/scripts/run-clang-tidy.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 9b3e33f491e..ec41fbb8883 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -265,7 +265,7 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
 {
   int block_size;
   int min_grid_size;
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
   // 2 elements per thread.
   const int grid_size = util::div_rounding_up_safe(size, 2 * block_size);
diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp
index 47870e01567..4a94ec9759c 100644
--- a/cpp/src/bitmask/is_element_valid.cpp
+++ b/cpp/src/bitmask/is_element_valid.cpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,11 +34,11 @@ bool is_element_valid_sync(column_view const& col_view,
   bitmask_type word;
   // null_mask() returns device ptr to bitmask without offset
   size_type index = element_index + col_view.offset();
-  CUDA_TRY(cudaMemcpyAsync(&word,
-                           col_view.null_mask() + word_index(index),
-                           sizeof(bitmask_type),
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
+                                col_view.null_mask() + word_index(index),
+                                sizeof(bitmask_type),
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
   stream.synchronize();
   return static_cast<bool>(word & (bitmask_type{1} << intra_word_index(index)));
 }
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index d1107ad3cfd..756cf3421c9 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -90,7 +90,7 @@ rmm::device_buffer create_null_mask(size_type size,
 
   if (state != mask_state::UNINITIALIZED) {
     uint8_t fill_value = (state == mask_state::ALL_VALID) ? 0xff : 0x00;
-    CUDA_TRY(cudaMemsetAsync(
+    CUDF_CUDA_TRY(cudaMemsetAsync(
       static_cast<bitmask_type*>(mask.data()), fill_value, mask_size, stream.value()));
   }
 
@@ -146,7 +146,7 @@ void set_null_mask(bitmask_type* bitmask,
     cudf::detail::grid_1d config(number_of_mask_words, 256);
     set_null_mask_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       static_cast<bitmask_type*>(bitmask), begin_bit, end_bit, valid, number_of_mask_words);
-    CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
   }
 }
 
@@ -220,7 +220,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
     cudf::detail::grid_1d config(number_of_mask_words, 256);
     copy_offset_bitmask<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       static_cast<bitmask_type*>(dest_mask.data()), mask, begin_bit, end_bit, number_of_mask_words);
-    CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
   }
   return dest_mask;
 }
diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu
index dd1803f4b90..fc244521617 100644
--- a/cpp/src/column/column_device_view.cu
+++ b/cpp/src/column/column_device_view.cu
@@ -77,11 +77,11 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
     new ColumnDeviceView(source, staging_buffer.data(), descendant_storage->data()), deleter};
 
   // copy the CPU memory with all the children into device memory
-  CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
-                           staging_buffer.data(),
-                           descendant_storage->size(),
-                           cudaMemcpyDefault,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
+                                staging_buffer.data(),
+                                descendant_storage->size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
 
   stream.synchronize();
 
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 46470e69611..514374d450d 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1046,11 +1046,11 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   setup_source_buf_info(input.begin(), input.end(), h_src_buf_info, h_src_buf_info);
 
   // HtoD indices and source buf info to device
-  CUDA_TRY(cudaMemcpyAsync(d_indices,
-                           h_indices,
-                           indices_size + src_buf_info_size,
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(d_indices,
+                                h_indices,
+                                indices_size + src_buf_info_size,
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
 
   // packed block of memory 2. partition buffer sizes and dst_buf_info structs
   std::size_t const buf_sizes_size =
@@ -1180,11 +1180,11 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   }
 
   // DtoH buf sizes and col info back to the host
-  CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
-                           d_buf_sizes,
-                           buf_sizes_size + dst_buf_info_size,
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
+                                d_buf_sizes,
+                                buf_sizes_size + dst_buf_info_size,
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
   stream.synchronize();
 
   // allocate output partition buffers
@@ -1224,14 +1224,14 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   });
 
   // HtoD src and dest buffers
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     d_src_bufs, h_src_bufs, src_bufs_size + dst_bufs_size, cudaMemcpyHostToDevice, stream.value()));
 
   // perform the copy.
   copy_data(num_bufs, num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info, stream);
 
   // DtoH dst info (to retrieve null counts)
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     h_dst_buf_info, d_dst_buf_info, dst_buf_info_size, cudaMemcpyDeviceToHost, stream.value()));
 
   stream.synchronize();
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 49ed0b7fc1d..44df981f5bf 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -645,10 +645,14 @@ bool can_use_hash_groupby(table_view const& keys, host_span<aggregation_request
     // Currently, structs are not supported in any of hash-based aggregations.
     // Therefore, if any request contains structs then we must fallback to sort-based aggregations.
     // TODO: Support structs in hash-based aggregations.
+    auto const v_type = is_dictionary(r.values.type())
+                          ? cudf::dictionary_column_view(r.values).keys().type()
+                          : r.values.type();
+
     return not(r.values.type().id() == type_id::STRUCT) and
-           cudf::has_atomic_support(r.values.type()) and
-           std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) {
-             return is_hash_aggregation(a->kind);
+           std::all_of(r.aggregations.begin(), r.aggregations.end(), [v_type](auto const& a) {
+             return cudf::has_atomic_support(cudf::detail::target_type(v_type, a->kind)) and
+                    is_hash_aggregation(a->kind);
            });
   });
 }
diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
index 0ae0baa9908..76f3fba4689 100644
--- a/cpp/src/hash/concurrent_unordered_map.cuh
+++ b/cpp/src/hash/concurrent_unordered_map.cuh
@@ -432,11 +432,11 @@ class concurrent_unordered_map {
 
       m_hashtbl_values = m_allocator.allocate(m_capacity, stream);
     }
-    CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
-                             other.m_hashtbl_values,
-                             m_capacity * sizeof(value_type),
-                             cudaMemcpyDefault,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
+                                  other.m_hashtbl_values,
+                                  m_capacity * sizeof(value_type),
+                                  cudaMemcpyDefault,
+                                  stream.value()));
   }
 
   void clear_async(rmm::cuda_stream_view stream = rmm::cuda_stream_default)
@@ -460,10 +460,10 @@ class concurrent_unordered_map {
     cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
 
     if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-      CUDA_TRY(cudaMemPrefetchAsync(
+      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
         m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
     }
-    CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
+    CUDF_CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
   }
 
   /**
@@ -532,8 +532,8 @@ class concurrent_unordered_map {
 
       if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
         int dev_id = 0;
-        CUDA_TRY(cudaGetDevice(&dev_id));
-        CUDA_TRY(cudaMemPrefetchAsync(
+        CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
+        CUDF_CUDA_TRY(cudaMemPrefetchAsync(
           m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
       }
     }
@@ -543,6 +543,6 @@ class concurrent_unordered_map {
         m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
     }
 
-    CUDA_TRY(cudaGetLastError());
+    CUDF_CHECK_CUDA(stream.value());
   }
 };
diff --git a/cpp/src/hash/concurrent_unordered_multimap.cuh b/cpp/src/hash/concurrent_unordered_multimap.cuh
index cdf5b6a8649..aa5b8db393f 100644
--- a/cpp/src/hash/concurrent_unordered_multimap.cuh
+++ b/cpp/src/hash/concurrent_unordered_multimap.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -488,11 +488,11 @@ class concurrent_unordered_multimap {
 
       m_hashtbl_values = m_allocator.allocate(m_hashtbl_capacity, stream);
     }
-    CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
-                             other.m_hashtbl_values,
-                             m_hashtbl_size * sizeof(value_type),
-                             cudaMemcpyDefault,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
+                                  other.m_hashtbl_values,
+                                  m_hashtbl_size * sizeof(value_type),
+                                  cudaMemcpyDefault,
+                                  stream.value()));
   }
 
   void clear_async(rmm::cuda_stream_view stream = rmm::cuda_stream_default)
@@ -519,7 +519,7 @@ class concurrent_unordered_multimap {
     cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
 
     if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-      CUDA_TRY(cudaMemPrefetchAsync(
+      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
         m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream.value()));
     }
   }
@@ -575,8 +575,8 @@ class concurrent_unordered_multimap {
 
       if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
         int dev_id = 0;
-        CUDA_TRY(cudaGetDevice(&dev_id));
-        CUDA_TRY(cudaMemPrefetchAsync(
+        CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
+        CUDF_CUDA_TRY(cudaMemPrefetchAsync(
           m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream.value()));
       }
     }
@@ -584,7 +584,7 @@ class concurrent_unordered_multimap {
     if (init) {
       init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0, stream.value()>>>(
         m_hashtbl_values, m_hashtbl_size, unused_key, unused_element);
-      CUDA_TRY(cudaGetLastError());
+      CUDF_CHECK_CUDA(stream.value());
     }
   }
 };
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 01ca32e6a2f..e5da4794ca3 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -144,7 +144,7 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
   // Make sure the current device ID matches the Tensor's device ID
   if (tensor.device.device_type != kDLCPU) {
     int device_id = 0;
-    CUDA_TRY(cudaGetDevice(&device_id));
+    CUDF_CUDA_TRY(cudaGetDevice(&device_id));
     CUDF_EXPECTS(tensor.device.device_id == device_id, "DLTensor device ID must be current device");
   }
 
@@ -184,11 +184,11 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
   for (auto& col : columns) {
     col = make_numeric_column(dtype, num_rows, mask_state::UNALLOCATED, stream, mr);
 
-    CUDA_TRY(cudaMemcpyAsync(col->mutable_view().head<void>(),
-                             reinterpret_cast<void*>(tensor_data),
-                             bytes,
-                             cudaMemcpyDefault,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(col->mutable_view().head<void>(),
+                                  reinterpret_cast<void*>(tensor_data),
+                                  bytes,
+                                  cudaMemcpyDefault,
+                                  stream.value()));
 
     tensor_data += col_stride;
   }
@@ -234,7 +234,7 @@ DLManagedTensor* to_dlpack(table_view const& input,
     tensor.strides[1] = num_rows;
   }
 
-  CUDA_TRY(cudaGetDevice(&tensor.device.device_id));
+  CUDF_CUDA_TRY(cudaGetDevice(&tensor.device.device_id));
   tensor.device.device_type = kDLCUDA;
 
   // If there is only one column, then a 1D tensor can just copy the pointer
@@ -254,11 +254,11 @@ DLManagedTensor* to_dlpack(table_view const& input,
 
   auto tensor_data = reinterpret_cast<uintptr_t>(tensor.data);
   for (auto const& col : input) {
-    CUDA_TRY(cudaMemcpyAsync(reinterpret_cast<void*>(tensor_data),
-                             get_column_data(col),
-                             stride_bytes,
-                             cudaMemcpyDefault,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(reinterpret_cast<void*>(tensor_data),
+                                  get_column_data(col),
+                                  stride_bytes,
+                                  cudaMemcpyDefault,
+                                  stream.value()));
     tensor_data += stride_bytes;
   }
 
diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
index 99b657fb9d5..6c5cd56d2a7 100644
--- a/cpp/src/interop/from_arrow.cu
+++ b/cpp/src/interop/from_arrow.cu
@@ -108,11 +108,11 @@ struct dispatch_to_cudf_column {
       stream,
       mr);
     auto mask_buffer = array.null_bitmap();
-    CUDA_TRY(cudaMemcpyAsync(mask->data(),
-                             reinterpret_cast<const uint8_t*>(mask_buffer->address()),
-                             array.null_bitmap()->size(),
-                             cudaMemcpyDefault,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(),
+                                  reinterpret_cast<const uint8_t*>(mask_buffer->address()),
+                                  array.null_bitmap()->size(),
+                                  cudaMemcpyDefault,
+                                  stream.value()));
     return mask;
   }
 
@@ -135,7 +135,7 @@ struct dispatch_to_cudf_column {
     auto const has_nulls     = skip_mask ? false : array.null_bitmap_data() != nullptr;
     auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
     auto mutable_column_view = col->mutable_view();
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       mutable_column_view.data<T>(),
       reinterpret_cast<const uint8_t*>(data_buffer->address()) + array.offset() * sizeof(T),
       sizeof(T) * num_rows,
@@ -191,7 +191,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<numeric::decimal128>
   auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
   auto mutable_column_view = col->mutable_view();
 
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     mutable_column_view.data<DeviceType>(),
     reinterpret_cast<const uint8_t*>(data_buffer->address()) + array.offset() * sizeof(DeviceType),
     sizeof(DeviceType) * num_rows,
@@ -227,11 +227,11 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<bool>(
 {
   auto data_buffer = array.data()->buffers[1];
   auto data        = rmm::device_buffer(data_buffer->size(), stream, mr);
-  CUDA_TRY(cudaMemcpyAsync(data.data(),
-                           reinterpret_cast<const uint8_t*>(data_buffer->address()),
-                           data_buffer->size(),
-                           cudaMemcpyDefault,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(),
+                                reinterpret_cast<const uint8_t*>(data_buffer->address()),
+                                data_buffer->size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
   auto out_col = mask_to_bools(static_cast<bitmask_type*>(data.data()),
                                array.offset(),
                                array.offset() + array.length(),
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index c7409978bb2..517a83c716e 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -53,11 +53,11 @@ std::shared_ptr<arrow::Buffer> fetch_data_buffer(column_view input_view,
 
   auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr);
 
-  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                           input_view.data<T>(),
-                           data_size_in_bytes,
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                                input_view.data<T>(),
+                                data_size_in_bytes,
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
 
   return std::move(data_buffer);
 }
@@ -73,7 +73,7 @@ std::shared_ptr<arrow::Buffer> fetch_mask_buffer(column_view input_view,
 
   if (input_view.has_nulls()) {
     auto mask_buffer = allocate_arrow_bitmap(static_cast<int64_t>(input_view.size()), ar_mr);
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       mask_buffer->mutable_data(),
       (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(),
       mask_size_in_bytes,
@@ -163,11 +163,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
   auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
-  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                           buf.data(),
-                           buf_size_in_bytes,
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                                buf.data(),
+                                buf_size_in_bytes,
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
 
   auto type    = arrow::decimal(18, -input.type().scale());
   auto mask    = fetch_mask_buffer(input, ar_mr, stream);
@@ -197,11 +197,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal128>
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
   auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
-  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                           buf.data(),
-                           buf_size_in_bytes,
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                                buf.data(),
+                                buf_size_in_bytes,
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
 
   auto type    = arrow::decimal(18, -input.type().scale());
   auto mask    = fetch_mask_buffer(input, ar_mr, stream);
@@ -222,11 +222,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<bool>(column_view in
 
   auto data_buffer = allocate_arrow_buffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
 
-  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                           bitmask.first->data(),
-                           bitmask.first->size(),
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                                bitmask.first->data(),
+                                bitmask.first->size(),
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
   return to_arrow_array(id,
                         static_cast<int64_t>(input.size()),
                         std::move(data_buffer),
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index b5b76c2def8..5885b61b35b 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -192,9 +192,9 @@ rmm::device_buffer decompress_data(datasource& source,
 
     for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) {
       inflate_in.host_to_device(stream);
-      CUDA_TRY(
+      CUDF_CUDA_TRY(
         cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value()));
-      CUDA_TRY(gpuinflate(
+      CUDF_CUDA_TRY(gpuinflate(
         inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream));
       inflate_out.device_to_host(stream, true);
 
@@ -424,11 +424,11 @@ std::vector<column_buffer> decode_data(metadata& meta,
   // Copy valid bits that are shared between columns
   for (size_t i = 0; i < out_buffers.size(); i++) {
     if (valid_alias[i] != nullptr) {
-      CUDA_TRY(cudaMemcpyAsync(out_buffers[i].null_mask(),
-                               valid_alias[i],
-                               out_buffers[i].null_mask_size(),
-                               cudaMemcpyHostToDevice,
-                               stream.value()));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(out_buffers[i].null_mask(),
+                                    valid_alias[i],
+                                    out_buffers[i].null_mask_size(),
+                                    cudaMemcpyHostToDevice,
+                                    stream.value()));
     }
   }
   schema_desc.device_to_host(stream, true);
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index b4a42a66133..631cf19b2aa 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -2048,7 +2048,7 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
   int sm_count = 0;
   int dev      = 0;
   uint32_t max_fb_size, min_fb_size, fb_size;
-  CUDA_TRY(cudaGetDevice(&dev));
+  CUDF_CUDA_TRY(cudaGetDevice(&dev));
   if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) {
     // printf("%d SMs on device %d\n", sm_count, dev);
     max_num_inputs =
@@ -2092,14 +2092,14 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
   scratch_size = min(scratch_size, (size_t)0xffffffffu);
   fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf);
 
-  CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value()));
   // NOTE: The 128KB dictionary copy can have a relatively large overhead since source isn't
   // page-locked
-  CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size,
-                           get_brotli_dictionary(),
-                           sizeof(brotli_dictionary_s),
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size,
+                                get_brotli_dictionary(),
+                                sizeof(brotli_dictionary_s),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
   gpu_debrotli_kernel<<<dim_grid, dim_block, 0, stream.value()>>>(
     inputs, outputs, scratch_u8, fb_heap_size, count32);
 #if DUMP_FB_HEAP
@@ -2107,7 +2107,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
   uint32_t cur = 0;
   printf("heap dump (%d bytes)\n", fb_heap_size);
   while (cur < fb_heap_size && !(cur & 3)) {
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       &dump[0], scratch_u8 + cur, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream.value()));
     stream.synchronize();
     printf("@%d: next = %d, size = %d\n", cur, dump[0], dump[1]);
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index ae9738164f3..cd070d28f38 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -260,11 +260,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
 
     auto const previous_data_size = d_data.size();
     d_data.resize(target_pos - buffer_pos, stream);
-    CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
-                             data.begin() + buffer_pos + previous_data_size,
-                             target_pos - buffer_pos - previous_data_size,
-                             cudaMemcpyDefault,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
+                                  data.begin() + buffer_pos + previous_data_size,
+                                  target_pos - buffer_pos - previous_data_size,
+                                  cudaMemcpyDefault,
+                                  stream.value()));
 
     // Pass 1: Count the potential number of rows in each character block for each
     // possible parser state at the beginning of the block.
@@ -280,11 +280,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
                                                                  range_end,
                                                                  skip_rows,
                                                                  stream);
-    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                             row_ctx.device_ptr(),
-                             num_blocks * sizeof(uint64_t),
-                             cudaMemcpyDeviceToHost,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                                  row_ctx.device_ptr(),
+                                  num_blocks * sizeof(uint64_t),
+                                  cudaMemcpyDeviceToHost,
+                                  stream.value()));
     stream.synchronize();
 
     // Sum up the rows in each character block, selecting the row count that
@@ -300,11 +300,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
       // At least one row in range in this batch
       all_row_offsets.resize(total_rows - skip_rows, stream);
 
-      CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
-                               row_ctx.host_ptr(),
-                               num_blocks * sizeof(uint64_t),
-                               cudaMemcpyHostToDevice,
-                               stream.value()));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
+                                    row_ctx.host_ptr(),
+                                    num_blocks * sizeof(uint64_t),
+                                    cudaMemcpyHostToDevice,
+                                    stream.value()));
 
       // Pass 2: Output row offsets
       cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
@@ -321,11 +321,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
                                              stream);
       // With byte range, we want to keep only one row out of the specified range
       if (range_end < data.size()) {
-        CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                 row_ctx.device_ptr(),
-                                 num_blocks * sizeof(uint64_t),
-                                 cudaMemcpyDeviceToHost,
-                                 stream.value()));
+        CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                                      row_ctx.device_ptr(),
+                                      num_blocks * sizeof(uint64_t),
+                                      cudaMemcpyDeviceToHost,
+                                      stream.value()));
         stream.synchronize();
 
         size_t rows_out_of_range = 0;
@@ -370,11 +370,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
   // Remove header rows and extract header
   const size_t header_row_index = std::max<size_t>(header_rows, 1) - 1;
   if (header_row_index + 1 < row_offsets.size()) {
-    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                             row_offsets.data() + header_row_index,
-                             2 * sizeof(uint64_t),
-                             cudaMemcpyDeviceToHost,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                                  row_offsets.data() + header_row_index,
+                                  2 * sizeof(uint64_t),
+                                  cudaMemcpyDeviceToHost,
+                                  stream.value()));
     stream.synchronize();
 
     const auto header_start = buffer_pos + row_ctx[0];
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index cb2197cf755..2aa93ae4d0f 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -378,11 +378,11 @@ void write_chunked(data_sink* out_sink,
   } else {
     // copy the bytes to host to write them out
     thrust::host_vector<char> h_bytes(total_num_bytes);
-    CUDA_TRY(cudaMemcpyAsync(h_bytes.data(),
-                             ptr_all_bytes,
-                             total_num_bytes * sizeof(char),
-                             cudaMemcpyDeviceToHost,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(),
+                                  ptr_all_bytes,
+                                  total_num_bytes * sizeof(char),
+                                  cudaMemcpyDeviceToHost,
+                                  stream.value()));
     stream.synchronize();
 
     out_sink->host_write(h_bytes.data(), total_num_bytes);
diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index d26831b9112..56a00191ae4 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -689,7 +689,7 @@ void convert_json_to_columns(parse_options_view const& opts,
 {
   int block_size;
   int min_grid_size;
-  CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+  CUDF_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
     &min_grid_size, &block_size, convert_data_to_columns_kernel));
 
   const int grid_size = (row_offsets.size() + block_size - 1) / block_size;
@@ -703,7 +703,7 @@ void convert_json_to_columns(parse_options_view const& opts,
                                                                                valid_fields,
                                                                                num_valid_fields);
 
-  CUDA_TRY(cudaGetLastError());
+  CUDF_CHECK_CUDA(stream.value());
 }
 
 /**
@@ -721,7 +721,7 @@ std::vector<cudf::io::column_type_histogram> detect_data_types(
 {
   int block_size;
   int min_grid_size;
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, detect_data_types_kernel));
 
   auto d_column_infos = [&]() {
@@ -763,7 +763,7 @@ void collect_keys_info(parse_options_view const& options,
 {
   int block_size;
   int min_grid_size;
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, collect_keys_info_kernel));
 
   // Calculate actual block count to use based on records count
@@ -772,7 +772,7 @@ void collect_keys_info(parse_options_view const& options,
   collect_keys_info_kernel<<<grid_size, block_size, 0, stream.value()>>>(
     options, data, row_offsets, keys_cnt, keys_info);
 
-  CUDA_TRY(cudaGetLastError());
+  CUDF_CHECK_CUDA(stream.value());
 }
 
 }  // namespace gpu
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 5ca947f3ee5..20eeec267b1 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -274,7 +274,7 @@ rmm::device_uvector<uint64_t> find_record_starts(json_reader_options const& read
   // Manually adding an extra row to account for the first row in the file
   if (reader_opts.get_byte_range_offset() == 0) {
     find_result_ptr++;
-    CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value()));
+    CUDF_CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value()));
   }
 
   std::vector<char> chars_to_find{'\n'};
@@ -356,18 +356,18 @@ std::pair<std::vector<std::string>, col_map_ptr_type> get_column_names_and_map(
   uint64_t first_row_len = d_data.size();
   if (rec_starts.size() > 1) {
     // Set first_row_len to the offset of the second row, if it exists
-    CUDA_TRY(cudaMemcpyAsync(&first_row_len,
-                             rec_starts.data() + 1,
-                             sizeof(uint64_t),
-                             cudaMemcpyDeviceToHost,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(&first_row_len,
+                                  rec_starts.data() + 1,
+                                  sizeof(uint64_t),
+                                  cudaMemcpyDeviceToHost,
+                                  stream.value()));
   }
   std::vector<char> first_row(first_row_len);
-  CUDA_TRY(cudaMemcpyAsync(first_row.data(),
-                           d_data.data(),
-                           first_row_len * sizeof(char),
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(first_row.data(),
+                                d_data.data(),
+                                first_row_len * sizeof(char),
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
   stream.synchronize();
 
   // Determine the row format between:
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 059df283c94..83c23774362 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -431,7 +431,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
     device_span<gpu_inflate_status_s> inflate_out_view(inflate_out.data(), num_compressed_blocks);
     switch (decompressor->GetKind()) {
       case orc::ZLIB:
-        CUDA_TRY(
+        CUDF_CUDA_TRY(
           gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream));
         break;
       case orc::SNAPPY:
@@ -440,7 +440,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
                                                            num_compressed_blocks};
           snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream);
         } else {
-          CUDA_TRY(
+          CUDF_CUDA_TRY(
             gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream));
         }
         break;
@@ -449,7 +449,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
     decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream);
   }
   if (num_uncompressed_blocks > 0) {
-    CUDA_TRY(gpu_copy_uncompressed_blocks(
+    CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks(
       inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream));
   }
   gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream);
@@ -1129,7 +1129,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
                 _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
                   offset, len);
               CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
-              CUDA_TRY(cudaMemcpyAsync(
+              CUDF_CUDA_TRY(cudaMemcpyAsync(
                 d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
               stream.synchronize();
             }
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 30385d395f1..d0c1cea97a8 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -706,11 +706,11 @@ std::vector<std::vector<rowgroup_rows>> calculate_aligned_rowgroup_bounds(
 
   auto aligned_rgs = hostdevice_2dvector<rowgroup_rows>(
     segmentation.num_rowgroups(), orc_table.num_columns(), stream);
-  CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(),
-                           segmentation.rowgroups.base_device_ptr(),
-                           aligned_rgs.count() * sizeof(rowgroup_rows),
-                           cudaMemcpyDefault,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(),
+                                segmentation.rowgroups.base_device_ptr(),
+                                aligned_rgs.count() * sizeof(rowgroup_rows),
+                                cudaMemcpyDefault,
+                                stream.value()));
   auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream);
 
   // One thread per column, per stripe
@@ -1330,7 +1330,7 @@ std::future<void> writer::impl::write_data_stream(gpu::StripeStream const& strm_
     if (out_sink_->is_device_write_preferred(length)) {
       return out_sink_->device_write_async(stream_in, length, stream);
     } else {
-      CUDA_TRY(
+      CUDF_CUDA_TRY(
         cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value()));
       stream.synchronize();
 
@@ -1419,10 +1419,10 @@ void pushdown_lists_null_mask(orc_column_view const& col,
                               rmm::cuda_stream_view stream)
 {
   // Set all bits - correct unless there's a mismatch between offsets and null mask
-  CUDA_TRY(cudaMemsetAsync(static_cast<void*>(out_mask.data()),
-                           255,
-                           out_mask.size() * sizeof(bitmask_type),
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(out_mask.data()),
+                                255,
+                                out_mask.size() * sizeof(bitmask_type),
+                                stream.value()));
 
   // Reset bits where a null list element has rows in the child column
   thrust::for_each_n(
@@ -1946,7 +1946,7 @@ void writer::impl::write(table_view const& table)
       } else {
         return pinned_buffer<uint8_t>{[](size_t size) {
                                         uint8_t* ptr = nullptr;
-                                        CUDA_TRY(cudaMallocHost(&ptr, size));
+                                        CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
                                         return ptr;
                                       }(max_stream_size),
                                       cudaFreeHost};
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 33151102aec..56eb34bbe2f 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1226,24 +1226,24 @@ rmm::device_buffer reader::impl::decompress_page_data(
         argc++;
       });
 
-      CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos),
-                               inflate_in.host_ptr(start_pos),
-                               sizeof(decltype(inflate_in)::value_type) * (argc - start_pos),
-                               cudaMemcpyHostToDevice,
-                               stream.value()));
-      CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos),
-                               inflate_out.host_ptr(start_pos),
-                               sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
-                               cudaMemcpyHostToDevice,
-                               stream.value()));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos),
+                                    inflate_in.host_ptr(start_pos),
+                                    sizeof(decltype(inflate_in)::value_type) * (argc - start_pos),
+                                    cudaMemcpyHostToDevice,
+                                    stream.value()));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos),
+                                    inflate_out.host_ptr(start_pos),
+                                    sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
+                                    cudaMemcpyHostToDevice,
+                                    stream.value()));
 
       switch (codec.compression_type) {
         case parquet::GZIP:
-          CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos),
-                              inflate_out.device_ptr(start_pos),
-                              argc - start_pos,
-                              1,
-                              stream))
+          CUDF_CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos),
+                                   inflate_out.device_ptr(start_pos),
+                                   argc - start_pos,
+                                   1,
+                                   stream))
           break;
         case parquet::SNAPPY:
           if (nvcomp_integration::is_stable_enabled()) {
@@ -1252,27 +1252,27 @@ rmm::device_buffer reader::impl::decompress_page_data(
                               codec.max_decompressed_size,
                               stream);
           } else {
-            CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos),
-                                inflate_out.device_ptr(start_pos),
-                                argc - start_pos,
-                                stream));
+            CUDF_CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos),
+                                     inflate_out.device_ptr(start_pos),
+                                     argc - start_pos,
+                                     stream));
           }
           break;
         case parquet::BROTLI:
-          CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos),
-                                inflate_out.device_ptr(start_pos),
-                                debrotli_scratch.data(),
-                                debrotli_scratch.size(),
-                                argc - start_pos,
-                                stream));
+          CUDF_CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos),
+                                     inflate_out.device_ptr(start_pos),
+                                     debrotli_scratch.data(),
+                                     debrotli_scratch.size(),
+                                     argc - start_pos,
+                                     stream));
           break;
         default: CUDF_FAIL("Unexpected decompression dispatch"); break;
       }
-      CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos),
-                               inflate_out.device_ptr(start_pos),
-                               sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
-                               cudaMemcpyDeviceToHost,
-                               stream.value()));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos),
+                                    inflate_out.device_ptr(start_pos),
+                                    sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
+                                    cudaMemcpyDeviceToHost,
+                                    stream.value()));
     }
   }
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 872ca6f6656..70a594423c9 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1123,7 +1123,7 @@ void writer::impl::encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks
       if (nvcomp_integration::is_stable_enabled()) {
         snappy_compress(comp_in, comp_stat, max_page_uncomp_data_size, stream);
       } else {
-        CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream));
+        CUDF_CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream));
       }
       break;
     default: break;
@@ -1136,11 +1136,11 @@ void writer::impl::encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks
   GatherPages(d_chunks_in_batch.flat_view(), pages, stream);
 
   auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch);
-  CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(),
-                           d_chunks_in_batch.data(),
-                           d_chunks_in_batch.flat_view().size_bytes(),
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(),
+                                d_chunks_in_batch.data(),
+                                d_chunks_in_batch.flat_view().size_bytes(),
+                                cudaMemcpyDeviceToHost,
+                                stream.value()));
   stream.synchronize();
 }
 
@@ -1579,28 +1579,28 @@ void writer::impl::write(table_view const& table, std::vector<partition_info> co
           // we still need to do a (much smaller) memcpy for the statistics.
           if (ck.ck_stat_size != 0) {
             column_chunk_meta.statistics_blob.resize(ck.ck_stat_size);
-            CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(),
-                                     dev_bfr,
-                                     ck.ck_stat_size,
-                                     cudaMemcpyDeviceToHost,
-                                     stream.value()));
+            CUDF_CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(),
+                                          dev_bfr,
+                                          ck.ck_stat_size,
+                                          cudaMemcpyDeviceToHost,
+                                          stream.value()));
             stream.synchronize();
           }
         } else {
           if (!host_bfr) {
             host_bfr = pinned_buffer<uint8_t>{[](size_t size) {
                                                 uint8_t* ptr = nullptr;
-                                                CUDA_TRY(cudaMallocHost(&ptr, size));
+                                                CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
                                                 return ptr;
                                               }(max_chunk_bfr_size),
                                               cudaFreeHost};
           }
           // copy the full data
-          CUDA_TRY(cudaMemcpyAsync(host_bfr.get(),
-                                   dev_bfr,
-                                   ck.ck_stat_size + ck.compressed_size,
-                                   cudaMemcpyDeviceToHost,
-                                   stream.value()));
+          CUDF_CUDA_TRY(cudaMemcpyAsync(host_bfr.get(),
+                                        dev_bfr,
+                                        ck.ck_stat_size + ck.compressed_size,
+                                        cudaMemcpyDeviceToHost,
+                                        stream.value()));
           stream.synchronize();
           out_sink_[p]->host_write(host_bfr.get() + ck.ck_stat_size, ck.compressed_size);
           if (ck.ck_stat_size != 0) {
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index 34d8307b024..fd510466477 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -55,7 +55,7 @@ inline rmm::device_buffer create_data(data_type type,
   std::size_t data_size = size_of(type) * size;
 
   rmm::device_buffer data(data_size, stream, mr);
-  CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value()));
 
   return data;
 }
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index a6bfb0d888f..ed8c3d6e1e3 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp
index 4b993043dd1..80c20529687 100644
--- a/cpp/src/io/utilities/config_utils.hpp
+++ b/cpp/src/io/utilities/config_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 367bbfcbdfa..5c73cf31428 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -54,7 +54,7 @@ class hostdevice_vector {
     : num_elements(initial_size), max_elements(max_size)
   {
     if (max_elements != 0) {
-      CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements));
+      CUDF_CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements));
       d_data.resize(sizeof(T) * max_elements, stream);
     }
   }
@@ -101,14 +101,14 @@ class hostdevice_vector {
 
   void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false)
   {
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       d_data.data(), h_data, memory_size(), cudaMemcpyHostToDevice, stream.value()));
     if (synchronize) { stream.synchronize(); }
   }
 
   void device_to_host(rmm::cuda_stream_view stream, bool synchronize = false)
   {
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       h_data, d_data.data(), memory_size(), cudaMemcpyDeviceToHost, stream.value()));
     if (synchronize) { stream.synchronize(); }
   }
diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu
index 2db87736848..a03789464cc 100644
--- a/cpp/src/io/utilities/parsing_utils.cu
+++ b/cpp/src/io/utilities/parsing_utils.cu
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -106,7 +122,7 @@ cudf::size_type find_all_from_set(device_span<char const> data,
 {
   int block_size    = 0;  // suggested thread count to use
   int min_grid_size = 0;  // minimum block count required
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
   const int grid_size = divCeil(data.size(), (size_t)block_size);
 
@@ -131,7 +147,7 @@ cudf::size_type find_all_from_set(host_span<char const> data,
 
   int block_size    = 0;  // suggested thread count to use
   int min_grid_size = 0;  // minimum block count required
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
 
   const size_t chunk_count = divCeil(data.size(), max_chunk_bytes);
@@ -143,7 +159,7 @@ cudf::size_type find_all_from_set(host_span<char const> data,
     const int grid_size   = divCeil(chunk_bits, block_size);
 
     // Copy chunk to device
-    CUDA_TRY(
+    CUDF_CUDA_TRY(
       cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value()));
 
     for (char key : keys) {
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index 159681eaffc..8228ff6da1f 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -77,9 +77,9 @@ std::filesystem::path get_cache_dir()
     int device;
     int cc_major;
     int cc_minor;
-    CUDA_TRY(cudaGetDevice(&device));
-    CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));
-    CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device));
+    CUDF_CUDA_TRY(cudaGetDevice(&device));
+    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));
+    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device));
     int cc = cc_major * 10 + cc_minor;
 
     kernel_cache_path /= std::to_string(cc);
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index dc62eeec539..9bf7e6a7a43 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 043c04b409e..01a94457b69 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -139,7 +139,7 @@ void materialize_bitmask(column_view const& left_col,
     }
   }
 
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 }
 
 struct side_index_generator {
@@ -212,7 +212,7 @@ index_vector generate_merged_indices(table_view const& left_table,
                   ineq_op);
   }
 
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 
   return merged_indices;
 }
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 171b81152ff..280a42d9e20 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -29,7 +29,7 @@ CUDF_HOST_DEVICE inline Result get_array_value(T const* devarr, size_type locati
 #if defined(__CUDA_ARCH__)
   result = devarr[location];
 #else
-  CUDA_TRY(cudaMemcpy(&result, devarr + location, sizeof(T), cudaMemcpyDeviceToHost));
+  CUDF_CUDA_TRY(cudaMemcpy(&result, devarr + location, sizeof(T), cudaMemcpyDeviceToHost));
 #endif
   return static_cast<Result>(result);
 }
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 61f728447e8..454a8c9d694 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -216,7 +216,7 @@ struct minmax_functor {
     // copy the minmax_pair to the host; does not copy the strings
     using OutputType = minmax_pair<cudf::string_view>;
     OutputType host_result;
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value()));
     // strings are copied to create the scalars here
     return {std::make_unique<string_scalar>(host_result.min_val, true, stream, mr),
@@ -235,7 +235,7 @@ struct minmax_functor {
     // copy the minmax_pair to the host to call get_element
     using OutputType = minmax_pair<T>;
     OutputType host_result;
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value()));
     // get the keys for those indexes
     auto const keys = dictionary_column_view(col).keys();
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 3b8cc17c4aa..885d7e904b4 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -67,7 +67,7 @@ struct scan_dispatcher {
     thrust::exclusive_scan(
       rmm::exec_policy(stream), begin, begin + input.size(), output.data<T>(), identity, Op{});
 
-    CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
     return output_column;
   }
 
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 9d07f340ebf..5ffdf1f5c56 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -120,7 +120,7 @@ struct scan_functor {
     thrust::inclusive_scan(
       rmm::exec_policy(stream), begin, begin + input_view.size(), result.data<T>(), Op{});
 
-    CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
     return output_column;
   }
 };
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index d704b18774f..ca07d60f426 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1281,7 +1281,7 @@ std::unique_ptr<column> rolling_window_udf(column_view const& input,
   output->set_null_count(output->size() - device_valid_count.value(stream));
 
   // check the stream for debugging
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 
   return output;
 }
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 76ec171052a..19bb60ef1a8 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -114,7 +114,7 @@ std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
   std::string result;
   result.resize(_data.size());
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     &result[0], _data.data(), _data.size(), cudaMemcpyDeviceToHost, stream.value()));
   stream.synchronize();
   return result;
diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu
index 477666d93ae..29eddf703df 100644
--- a/cpp/src/search/search.cu
+++ b/cpp/src/search/search.cu
@@ -97,7 +97,8 @@ std::unique_ptr<column> search_ordered(table_view const& t,
 
   // Handle empty inputs
   if (t.num_rows() == 0) {
-    CUDA_TRY(cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value()));
+    CUDF_CUDA_TRY(
+      cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value()));
     return result;
   }
 
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index adfd24f1ca2..6a90a605ca3 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -87,11 +87,11 @@ std::unique_ptr<column> join_strings(strings_column_view const& strings,
   auto offsets_view = offsets_column->mutable_view();
   // set the first entry to 0 and the last entry to bytes
   int32_t new_offsets[] = {0, static_cast<int32_t>(bytes)};
-  CUDA_TRY(cudaMemcpyAsync(offsets_view.data<int32_t>(),
-                           new_offsets,
-                           sizeof(new_offsets),
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data<int32_t>(),
+                                new_offsets,
+                                sizeof(new_offsets),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
 
   // build null mask
   // only one entry so it is either all valid or all null
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index ac3c4df6aeb..1a423ef8eec 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -153,11 +153,11 @@ struct format_compiler {
 
     // create program in device memory
     d_items.resize(items.size(), stream);
-    CUDA_TRY(cudaMemcpyAsync(d_items.data(),
-                             items.data(),
-                             items.size() * sizeof(items[0]),
-                             cudaMemcpyHostToDevice,
-                             stream.value()));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(),
+                                  items.data(),
+                                  items.size() * sizeof(items[0]),
+                                  cudaMemcpyHostToDevice,
+                                  stream.value()));
   }
 
   format_item const* compiled_format_items() { return d_items.data(); }
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 9fa033e9f9a..fedb8d38a08 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -297,7 +297,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
           cudf::detail::get_value<offset_type>(offsets_child, column_size + column_offset, stream) -
           bytes_offset;
 
-        CUDA_TRY(
+        CUDF_CUDA_TRY(
           cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream.value()));
 
         // get ready for the next column
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index b286812226b..3bcf55cf069 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -163,7 +163,7 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   }
 
   // copy flat prog to device memory
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice, stream.value()));
   //
   auto deleter = [d_buffer, d_relists](reprog_device* t) {
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index c0673a5e2b5..d496b46bc36 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -81,10 +81,10 @@ auto generate_empty_output(strings_column_view const& input,
 
   auto offsets_column = make_numeric_column(
     data_type{type_to_id<offset_type>()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data<offset_type>(),
-                           0,
-                           offsets_column->size() * sizeof(offset_type),
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data<offset_type>(),
+                                0,
+                                offsets_column->size() * sizeof(offset_type),
+                                stream.value()));
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),
@@ -264,7 +264,7 @@ auto make_strings_children(Func fn,
   } else {
     // Compute the offsets values from the provided output string sizes.
     auto const string_sizes = output_strings_sizes.value();
-    CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value()));
+    CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value()));
     thrust::inclusive_scan(rmm::exec_policy(stream),
                            string_sizes.template begin<size_type>(),
                            string_sizes.template end<size_type>(),
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 825f09c66e6..d7cc72fdfff 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -130,9 +130,9 @@ const character_flags_table_type* get_character_flags_table()
 {
   return d_character_codepoint_flags.find_or_initialize([&](void) {
     character_flags_table_type* table = nullptr;
-    CUDA_TRY(cudaMemcpyToSymbol(
+    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
       character_codepoint_flags, g_character_codepoint_flags, sizeof(g_character_codepoint_flags)));
-    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_codepoint_flags));
+    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_codepoint_flags));
     return table;
   });
 }
@@ -144,9 +144,9 @@ const character_cases_table_type* get_character_cases_table()
 {
   return d_character_cases_table.find_or_initialize([&](void) {
     character_cases_table_type* table = nullptr;
-    CUDA_TRY(cudaMemcpyToSymbol(
+    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
       character_cases_table, g_character_cases_table, sizeof(g_character_cases_table)));
-    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_cases_table));
+    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_cases_table));
     return table;
   });
 }
@@ -158,9 +158,9 @@ const special_case_mapping* get_special_case_mapping_table()
 {
   return d_special_case_mappings.find_or_initialize([&](void) {
     special_case_mapping* table = nullptr;
-    CUDA_TRY(cudaMemcpyToSymbol(
+    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
       character_special_case_mappings, g_special_case_mappings, sizeof(g_special_case_mappings)));
-    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_special_case_mappings));
+    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_special_case_mappings));
     return table;
   });
 }
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index 6ec364cc048..b69d735f612 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -231,7 +231,7 @@ std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view con
   cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2;
   rmm::device_uvector<int32_t> offsets(n_upper, stream);
   auto d_offsets = offsets.data();
-  CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index 9ab769f9edd..00094f2de71 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -52,12 +52,12 @@ rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_st
                table + cp_section1_end,
                table + codepoint_metadata_size,
                codepoint_metadata_default_value);
-  CUDA_TRY(cudaMemcpyAsync(table,
-                           codepoint_metadata,
-                           cp_section1_end * sizeof(codepoint_metadata[0]),  // 1st section
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(table,
+                                codepoint_metadata,
+                                cp_section1_end * sizeof(codepoint_metadata[0]),  // 1st section
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     table + cp_section2_begin,
     cp_metadata_917505_917999,
     (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]),  // 2nd section
@@ -80,24 +80,24 @@ rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_st
                table + aux_section1_end,
                table + aux_codepoint_data_size,
                aux_codepoint_default_value);
-  CUDA_TRY(cudaMemcpyAsync(table,
-                           aux_codepoint_data,
-                           aux_section1_end * sizeof(aux_codepoint_data[0]),  // 1st section
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(table,
+                                aux_codepoint_data,
+                                aux_section1_end * sizeof(aux_codepoint_data[0]),  // 1st section
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     table + aux_section2_begin,
     aux_cp_data_44032_55203,
     (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]),  // 2nd section
     cudaMemcpyHostToDevice,
     stream.value()));
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     table + aux_section3_begin,
     aux_cp_data_70475_71099,
     (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]),  // 3rd section
     cudaMemcpyHostToDevice,
     stream.value()));
-  CUDA_TRY(cudaMemcpyAsync(
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
     table + aux_section4_begin,
     aux_cp_data_119134_119232,
     (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]),  // 4th section
@@ -236,33 +236,33 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
                                            cudf::mask_state::UNALLOCATED,
                                            stream,
                                            mr);
-  CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data<uint64_t>(),
-                           table.data(),
-                           table.size() * sizeof(uint64_t),
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data<uint64_t>(),
+                                table.data(),
+                                table.size() * sizeof(uint64_t),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
 
   result.bin_coefficients = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64},
                                                       bin_coefficients.size(),
                                                       cudf::mask_state::UNALLOCATED,
                                                       stream,
                                                       mr);
-  CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data<uint64_t>(),
-                           bin_coefficients.data(),
-                           bin_coefficients.size() * sizeof(uint64_t),
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data<uint64_t>(),
+                                bin_coefficients.data(),
+                                bin_coefficients.size() * sizeof(uint64_t),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
 
   result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16},
                                                  bin_offsets.size(),
                                                  cudf::mask_state::UNALLOCATED,
                                                  stream,
                                                  mr);
-  CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data<uint16_t>(),
-                           bin_offsets.data(),
-                           bin_offsets.size() * sizeof(uint16_t),
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data<uint16_t>(),
+                                bin_offsets.data(),
+                                bin_offsets.size() * sizeof(uint16_t),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
 
   auto cp_metadata            = detail::get_codepoint_metadata(stream);
   auto const cp_metadata_size = static_cast<cudf::size_type>(cp_metadata.size());
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index 82bb50c6aaa..7d8df583039 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -457,7 +457,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                                      num_code_points,
                                                                      device_token_ids.data(),
                                                                      device_tokens_per_word.data());
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 
   cudf::detail::grid_1d const grid_mark{static_cast<cudf::size_type>(num_strings + 1),
                                         THREADS_PER_BLOCK};
@@ -469,7 +469,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                          device_start_word_indices,
                                                          device_end_word_indices,
                                                          num_strings);
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 
   // check for special tokens and adjust indices
   thrust::for_each_n(
@@ -512,7 +512,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
       num_words,
       device_token_ids.data(),
       device_tokens_per_word.data());
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 
   // Repurpose the input array for the token ids. In the worst case, each code point ends up being a
   // token so this will always have enough memory to store the contiguous tokens.
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index bc3678380be..74433af9f05 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -102,9 +102,9 @@ std::unique_ptr<column> compute_column(table_view const& table,
   // Configure kernel parameters
   auto const& device_expression_data = parser.device_expression_data;
   int device_id;
-  CUDA_TRY(cudaGetDevice(&device_id));
+  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   auto constexpr MAX_BLOCK_SIZE = 128;
   auto const block_size =
@@ -125,7 +125,7 @@ std::unique_ptr<column> compute_column(table_view const& table,
       <<<config.num_blocks, config.num_threads_per_block, shmem_per_block, stream.value()>>>(
         *table_device, device_expression_data, *mutable_output_device);
   }
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
   return output_column;
 }
 
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 0f06be0149e..744cec90fd9 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -505,9 +505,9 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
   // of memory of size (# input rows * sizeof(row_span) * max_branch_depth).
   auto const shmem_per_thread = sizeof(row_span) * h_info.max_branch_depth;
   int device_id;
-  CUDA_TRY(cudaGetDevice(&device_id));
+  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   constexpr int max_block_size = 256;
   auto const block_size =
diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh
index 19d78b010ec..08b68cc0591 100644
--- a/cpp/src/unary/unary_ops.cuh
+++ b/cpp/src/unary/unary_ops.cuh
@@ -70,7 +70,7 @@ struct launcher {
     thrust::transform(
       rmm::exec_policy(stream), input.begin<T>(), input.end<T>(), output_view.begin<Tout>(), F{});
 
-    CHECK_CUDA(stream.value());
+    CUDF_CHECK_CUDA(stream.value());
 
     return output;
   }
diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp
index 3a479f0860b..6c2c0716331 100644
--- a/cpp/tests/bitmask/bitmask_tests.cpp
+++ b/cpp/tests/bitmask/bitmask_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -88,10 +88,10 @@ rmm::device_uvector<cudf::bitmask_type> make_mask(cudf::size_type size, bool fil
     return cudf::detail::make_zeroed_device_uvector_sync<cudf::bitmask_type>(size);
   } else {
     auto ret = rmm::device_uvector<cudf::bitmask_type>(size, rmm::cuda_stream_default);
-    CUDA_TRY(cudaMemsetAsync(ret.data(),
-                             ~cudf::bitmask_type{0},
-                             size * sizeof(cudf::bitmask_type),
-                             rmm::cuda_stream_default.value()));
+    CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(),
+                                  ~cudf::bitmask_type{0},
+                                  size * sizeof(cudf::bitmask_type),
+                                  rmm::cuda_stream_default.value()));
     return ret;
   }
 }
@@ -530,10 +530,10 @@ void cleanEndWord(rmm::device_buffer& mask, int begin_bit, int end_bit)
   auto number_of_bits       = end_bit - begin_bit;
   if (number_of_bits % 32 != 0) {
     cudf::bitmask_type end_mask = 0;
-    CUDA_TRY(cudaMemcpy(
+    CUDF_CUDA_TRY(cudaMemcpy(
       &end_mask, ptr + number_of_mask_words - 1, sizeof(end_mask), cudaMemcpyDeviceToHost));
     end_mask = end_mask & ((1 << (number_of_bits % 32)) - 1);
-    CUDA_TRY(cudaMemcpy(
+    CUDF_CUDA_TRY(cudaMemcpy(
       ptr + number_of_mask_words - 1, &end_mask, sizeof(end_mask), cudaMemcpyHostToDevice));
   }
 }
diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu
index 93e4e588e0e..4d76008fd13 100644
--- a/cpp/tests/copying/concatenate_tests.cu
+++ b/cpp/tests/copying/concatenate_tests.cu
@@ -62,9 +62,9 @@ struct TypedColumnTest : public cudf::test::BaseFixture {
     std::iota(h_data.begin(), h_data.end(), char{0});
     std::vector<char> h_mask(mask.size());
     std::iota(h_mask.begin(), h_mask.end(), char{0});
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       typed_data, h_data.data(), data.size(), cudaMemcpyHostToDevice, stream.value()));
-    CUDA_TRY(cudaMemcpyAsync(
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
       typed_mask, h_mask.data(), mask.size(), cudaMemcpyHostToDevice, stream.value()));
     stream.synchronize();
   }
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index 581268f26f4..1067366d010 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -155,7 +155,7 @@ struct AtomicsTest : public cudf::test::BaseFixture {
 
     auto host_result = cudf::detail::make_host_vector_sync(dev_result);
 
-    CHECK_CUDA(rmm::cuda_stream_default.value());
+    CUDF_CHECK_CUDA(rmm::cuda_stream_default.value());
 
     if (!is_timestamp_sum<T, cudf::DeviceSum>()) {
       EXPECT_EQ(host_result[0], exact[0]) << "atomicAdd test failed";
@@ -302,7 +302,7 @@ struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture {
 
     auto host_result = cudf::detail::make_host_vector_sync(dev_result);
 
-    CHECK_CUDA(rmm::cuda_stream_default.value());
+    CUDF_CHECK_CUDA(rmm::cuda_stream_default.value());
 
     // print_exact(exact, "exact");
     // print_exact(host_result.data(), "result");
diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
index da9509e94a6..4327a8b694b 100644
--- a/cpp/tests/error/error_handling_test.cu
+++ b/cpp/tests/error/error_handling_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,28 +36,28 @@ TEST(ExpectsTest, TryCatch)
 
 TEST(CudaTryTest, Error)
 {
-  CUDA_EXPECT_THROW_MESSAGE(CUDA_TRY(cudaErrorLaunchFailure),
+  CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure),
                             "cudaErrorLaunchFailure unspecified launch failure");
 }
-TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDA_TRY(cudaSuccess)); }
+TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); }
 
 TEST(CudaTryTest, TryCatch)
 {
-  CUDA_EXPECT_THROW_MESSAGE(CUDA_TRY(cudaErrorMemoryAllocation),
+  CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation),
                             "cudaErrorMemoryAllocation out of memory");
 }
 
-TEST(StreamCheck, success) { EXPECT_NO_THROW(CHECK_CUDA(0)); }
+TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); }
 
 namespace {
 // Some silly kernel that will cause an error
 void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }
 }  // namespace
 
-// In a release build and without explicit synchronization, CHECK_CUDA may
+// In a release build and without explicit synchronization, CUDF_CHECK_CUDA may
 // or may not fail on erroneous asynchronous CUDA calls. Invoke
 // cudaStreamSynchronize to guarantee failure on error. In a non-release build,
-// CHECK_CUDA deterministically fails on erroneous asynchronous CUDA
+// CUDF_CHECK_CUDA deterministically fails on erroneous asynchronous CUDA
 // calls.
 TEST(StreamCheck, FailedKernel)
 {
@@ -67,7 +67,7 @@ TEST(StreamCheck, FailedKernel)
 #ifdef NDEBUG
   stream.synchronize();
 #endif
-  EXPECT_THROW(CHECK_CUDA(stream.value()), cudf::cuda_error);
+  EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error);
 }
 
 TEST(StreamCheck, CatchFailedKernel)
@@ -78,7 +78,7 @@ TEST(StreamCheck, CatchFailedKernel)
 #ifndef NDEBUG
   stream.synchronize();
 #endif
-  CUDA_EXPECT_THROW_MESSAGE(CHECK_CUDA(stream.value()),
+  CUDA_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()),
                             "cudaErrorInvalidConfiguration "
                             "invalid configuration argument");
 }
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 1f4a8a7e508..cd0aab3caeb 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -1101,11 +1101,11 @@ class custom_test_data_sink : public cudf::io::data_sink {
   {
     return std::async(std::launch::deferred, [=] {
       char* ptr = nullptr;
-      CUDA_TRY(cudaMallocHost(&ptr, size));
-      CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
+      CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
       stream.synchronize();
       outfile_.write(ptr, size);
-      CUDA_TRY(cudaFreeHost(ptr));
+      CUDF_CUDA_TRY(cudaFreeHost(ptr));
     });
   }
 
@@ -2166,11 +2166,11 @@ class custom_test_memmap_sink : public cudf::io::data_sink {
   {
     return std::async(std::launch::deferred, [=] {
       char* ptr = nullptr;
-      CUDA_TRY(cudaMallocHost(&ptr, size));
-      CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
+      CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
       stream.synchronize();
       mm_writer->host_write(ptr, size);
-      CUDA_TRY(cudaFreeHost(ptr));
+      CUDF_CUDA_TRY(cudaFreeHost(ptr));
     });
   }
 
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index 57041e448a2..f560ce7f20c 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -1423,7 +1423,7 @@ TEST_F(JoinTest, HashJoinLargeOutputSize)
   // self-join a table of zeroes to generate an output row count that would overflow int32_t
   std::size_t col_size = 65567;
   rmm::device_buffer zeroes(col_size * sizeof(int32_t), rmm::cuda_stream_default);
-  CUDA_TRY(cudaMemsetAsync(zeroes.data(), 0, zeroes.size(), rmm::cuda_stream_default.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(zeroes.data(), 0, zeroes.size(), rmm::cuda_stream_default.value()));
   cudf::column_view col_zeros(cudf::data_type{cudf::type_id::INT32}, col_size, zeroes.data());
   cudf::table_view tview{{col_zeros}};
   cudf::hash_join hash_join(tview, cudf::null_equality::UNEQUAL);
diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu
index f9ed22150b7..df5b1f5c14a 100644
--- a/cpp/tests/join/mixed_join_tests.cu
+++ b/cpp/tests/join/mixed_join_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu
index ee4c878726f..30c843a91c4 100644
--- a/cpp/tests/scalar/scalar_device_view_test.cu
+++ b/cpp/tests/scalar/scalar_device_view_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,13 +60,13 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)
   rmm::device_scalar<bool> result{rmm::cuda_stream_default};
 
   test_set_value<<<1, 1>>>(scalar_device_view, scalar_device_view1);
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   EXPECT_EQ(s1.value(), value);
   EXPECT_TRUE(s1.is_valid());
 
   test_value<<<1, 1>>>(scalar_device_view, scalar_device_view1, result.data());
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
@@ -85,7 +85,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
   rmm::device_scalar<bool> result{rmm::cuda_stream_default};
 
   test_null<<<1, 1>>>(scalar_device_view, result.data());
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   EXPECT_FALSE(result.value(rmm::cuda_stream_default));
 }
@@ -105,7 +105,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, SetNull)
   EXPECT_TRUE(s.is_valid());
 
   test_setnull<<<1, 1>>>(scalar_device_view);
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   EXPECT_FALSE(s.is_valid());
 }
@@ -131,7 +131,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
   auto value_v = cudf::detail::make_device_uvector_sync(value);
 
   test_string_value<<<1, 1>>>(scalar_device_view, value_v.data(), value.size(), result.data());
-  CHECK_CUDA(0);
+  CUDF_CHECK_CUDA(0);
 
   EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp
index a6e1a25ec17..b9ea7a0b078 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -86,7 +86,7 @@ TYPED_TEST(Sort, WithNullMax)
     // the rest of the values are equivalent and yields random sorted order.
     auto to_host = [](column_view const& col) {
       thrust::host_vector<int32_t> h_data(col.size());
-      CUDA_TRY(cudaMemcpy(
+      CUDF_CUDA_TRY(cudaMemcpy(
         h_data.data(), col.data<int32_t>(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
       return h_data;
     };
@@ -124,7 +124,7 @@ TYPED_TEST(Sort, WithNullMin)
     // the rest of the values are equivalent and yields random sorted order.
     auto to_host = [](column_view const& col) {
       thrust::host_vector<int32_t> h_data(col.size());
-      CUDA_TRY(cudaMemcpy(
+      CUDF_CUDA_TRY(cudaMemcpy(
         h_data.data(), col.data<int32_t>(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
       return h_data;
     };
@@ -160,7 +160,7 @@ TYPED_TEST(Sort, WithMixedNullOrder)
     // the rest of the values are equivalent and yields random sorted order.
     auto to_host = [](column_view const& col) {
       thrust::host_vector<int32_t> h_data(col.size());
-      CUDA_TRY(cudaMemcpy(
+      CUDF_CUDA_TRY(cudaMemcpy(
         h_data.data(), col.data<int32_t>(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
       return h_data;
     };
diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp
index b6b7495136e..ee43c9e7b4b 100644
--- a/cpp/tests/sort/stable_sort_tests.cpp
+++ b/cpp/tests/sort/stable_sort_tests.cpp
@@ -94,7 +94,7 @@ TYPED_TEST(StableSort, WithNullMax)
     // the rest of the values are equivalent and yields random sorted order.
     auto to_host = [](column_view const& col) {
       thrust::host_vector<int32_t> h_data(col.size());
-      CUDA_TRY(cudaMemcpy(
+      CUDF_CUDA_TRY(cudaMemcpy(
         h_data.data(), col.data<int32_t>(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
       return h_data;
     };
@@ -130,7 +130,7 @@ TYPED_TEST(StableSort, WithNullMin)
     // the rest of the values are equivalent and yields random sorted order.
     auto to_host = [](column_view const& col) {
       thrust::host_vector<int32_t> h_data(col.size());
-      CUDA_TRY(cudaMemcpy(
+      CUDF_CUDA_TRY(cudaMemcpy(
         h_data.data(), col.data<int32_t>(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
       return h_data;
     };
diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu
index 0ba4b268c70..6861737bfb5 100644
--- a/cpp/tests/strings/factories_test.cu
+++ b/cpp/tests/strings/factories_test.cu
@@ -78,7 +78,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair)
     h_offsets[idx + 1] = offset;
   }
   auto d_strings = cudf::detail::make_device_uvector_sync(strings);
-  CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice));
+  CUDF_CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice));
   auto column = cudf::make_strings_column(d_strings);
   EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING});
   EXPECT_EQ(column->null_count(), nulls);
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index 7f8a31ef9bb..5802a1ddc0a 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -302,10 +302,10 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger)
   auto integers      = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<TypeParam>()},
                                             (cudf::size_type)d_integers.size());
   auto integers_view = integers->mutable_view();
-  CUDA_TRY(cudaMemcpy(integers_view.data<TypeParam>(),
-                      d_integers.data(),
-                      d_integers.size() * sizeof(TypeParam),
-                      cudaMemcpyDeviceToDevice));
+  CUDF_CUDA_TRY(cudaMemcpy(integers_view.data<TypeParam>(),
+                           d_integers.data(),
+                           d_integers.size() * sizeof(TypeParam),
+                           cudaMemcpyDeviceToDevice));
   integers_view.set_null_count(0);
 
   // convert to strings
diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu
index dca80b597c0..d8b2a736bde 100644
--- a/cpp/tests/types/type_dispatcher_test.cu
+++ b/cpp/tests/types/type_dispatcher_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -71,7 +71,7 @@ TYPED_TEST(TypedDispatcherTest, DeviceDispatch)
 {
   auto result = cudf::detail::make_zeroed_device_uvector_sync<bool>(1);
   dispatch_test_kernel<<<1, 1>>>(cudf::type_to_id<TypeParam>(), result.data());
-  CUDA_TRY(cudaDeviceSynchronize());
+  CUDF_CUDA_TRY(cudaDeviceSynchronize());
   EXPECT_EQ(true, result.front_element(rmm::cuda_stream_default));
 }
 
@@ -132,7 +132,7 @@ TYPED_TEST(TypedDoubleDispatcherTest, DeviceDoubleDispatch)
   auto result = cudf::detail::make_zeroed_device_uvector_sync<bool>(1);
   double_dispatch_test_kernel<<<1, 1>>>(
     cudf::type_to_id<TypeParam>(), cudf::type_to_id<TypeParam>(), result.data());
-  CUDA_TRY(cudaDeviceSynchronize());
+  CUDF_CUDA_TRY(cudaDeviceSynchronize());
   EXPECT_EQ(true, result.front_element(rmm::cuda_stream_default));
 }
 
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 68626c2d4d3..015178f8c7c 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -824,16 +824,16 @@ std::vector<bitmask_type> bitmask_to_host(cudf::column_view const& c)
     auto num_bitmasks = num_bitmask_words(c.size());
     std::vector<bitmask_type> host_bitmask(num_bitmasks);
     if (c.offset() == 0) {
-      CUDA_TRY(cudaMemcpy(host_bitmask.data(),
-                          c.null_mask(),
-                          num_bitmasks * sizeof(bitmask_type),
-                          cudaMemcpyDeviceToHost));
+      CUDF_CUDA_TRY(cudaMemcpy(host_bitmask.data(),
+                               c.null_mask(),
+                               num_bitmasks * sizeof(bitmask_type),
+                               cudaMemcpyDeviceToHost));
     } else {
       auto mask = copy_bitmask(c.null_mask(), c.offset(), c.offset() + c.size());
-      CUDA_TRY(cudaMemcpy(host_bitmask.data(),
-                          mask.data(),
-                          num_bitmasks * sizeof(bitmask_type),
-                          cudaMemcpyDeviceToHost));
+      CUDF_CUDA_TRY(cudaMemcpy(host_bitmask.data(),
+                               mask.data(),
+                               num_bitmasks * sizeof(bitmask_type),
+                               cudaMemcpyDeviceToHost));
     }
 
     return host_bitmask;
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 562501c01c6..dbdf8e59e6a 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 #
 # cudf documentation build configuration file, created by
 # sphinx-quickstart on Wed May  3 10:59:22 2017.
diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb
index 169eec07914..b34bbd3f193 100644
--- a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb
+++ b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb
@@ -45,23 +45,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "158 µs ± 306 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
-      "419 µs ± 149 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py:3044: FutureWarning: The as_gpu_matrix method will be removed in a future cuDF release. Consider using `to_cupy` instead.\n",
-      "  FutureWarning,\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "339 µs ± 282 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
+      "167 µs ± 789 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
+      "497 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n",
+      "502 µs ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
      ]
     }
    ],
@@ -72,9 +58,9 @@
     "                     'c':range(1000, nelem + 1000)}\n",
     "                   )\n",
     "\n",
-    "%timeit arr_cupy = cp.fromDlpack(df.to_dlpack())\n",
+    "%timeit arr_cupy = cp.from_dlpack(df.to_dlpack())\n",
     "%timeit arr_cupy = df.values\n",
-    "%timeit arr_cupy = cp.asarray(df.as_gpu_matrix())"
+    "%timeit arr_cupy = df.to_cupy()"
    ]
   },
   {
@@ -100,7 +86,7 @@
     }
    ],
    "source": [
-    "arr_cupy = cp.fromDlpack(df.to_dlpack())\n",
+    "arr_cupy = cp.from_dlpack(df.to_dlpack())\n",
     "arr_cupy"
    ]
   },
@@ -131,9 +117,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "45.4 µs ± 63.9 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
-      "127 µs ± 351 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
-      "135 µs ± 5.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
+      "75.2 µs ± 117 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
+      "185 µs ± 630 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n",
+      "169 µs ± 1.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
      ]
     }
    ],
@@ -141,7 +127,7 @@
     "col = 'a'\n",
     "\n",
     "%timeit cola_cupy = cp.asarray(df[col])\n",
-    "%timeit cola_cupy = cp.fromDlpack(df[col].to_dlpack())\n",
+    "%timeit cola_cupy = cp.from_dlpack(df[col].to_dlpack())\n",
     "%timeit cola_cupy = df[col].values"
    ]
   },
@@ -270,7 +256,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "15.5 ms ± 7.55 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "22 ms ± 26.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
      ]
     }
    ],
@@ -524,7 +510,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "7.26 ms ± 3.32 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "8.6 ms ± 33.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -544,7 +530,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "4.87 ms ± 2.08 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+      "5.56 ms ± 37.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
      ]
     }
    ],
@@ -1037,7 +1023,7 @@
     }
    ],
    "source": [
-    "new_arr = cp.fromDlpack(reshaped_df.to_dlpack())\n",
+    "new_arr = cp.from_dlpack(reshaped_df.to_dlpack())\n",
     "new_arr.sum(axis=1)"
    ]
   },
@@ -1075,7 +1061,7 @@
     "    if sparseformat == 'row':\n",
     "        _sparse_constructor = cp.sparse.csr_matrix\n",
     "\n",
-    "    return _sparse_constructor(cp.fromDlpack(data.to_dlpack()))"
+    "    return _sparse_constructor(cp.from_dlpack(data.to_dlpack()))"
    ]
   },
   {
@@ -1154,134 +1140,141 @@
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>4.704433</td>\n",
+       "      <td>3.380014</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>11.030136</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>-1.162275</td>\n",
        "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>5.726806</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>11.460403</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>5.917846</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>5.90886</td>\n",
+       "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.407392</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>6.646564</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>3.399164</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>14.092100</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>0.378781</td>\n",
+       "      <td>0.420953</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.000000</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>8.299425</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>2.096401</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>0.109242</td>\n",
+       "      <td>2.541798</td>\n",
        "      <td>0.0</td>\n",
+       "      <td>0.071563</td>\n",
+       "      <td>8.223387</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.000000</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.00000</td>\n",
+       "      <td>10.744624</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "    a0   a1   a2   a3   a4        a5   a6        a7   a8   a9  a10       a11  \\\n",
-       "0  0.0  0.0  0.0  0.0  0.0  4.704433  0.0  0.000000  0.0  0.0  0.0  0.000000   \n",
-       "1  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000   \n",
-       "2  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000   \n",
-       "3  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  0.000000  0.0  0.0  0.0  0.000000   \n",
-       "4  0.0  0.0  0.0  0.0  0.0  0.000000  0.0  8.299425  0.0  0.0  0.0  2.096401   \n",
+       "    a0   a1   a2        a3   a4        a5         a6   a7         a8  \\\n",
+       "0  0.0  0.0  0.0  3.380014  0.0  0.000000  11.030136  0.0   0.000000   \n",
+       "1  0.0  0.0  0.0  0.000000  0.0  0.000000   0.000000  0.0   0.000000   \n",
+       "2  0.0  0.0  0.0  0.000000  0.0  0.000000   0.000000  0.0   0.000000   \n",
+       "3  0.0  0.0  0.0  0.000000  0.0  0.000000   0.000000  0.0  14.092100   \n",
+       "4  0.0  0.0  0.0  0.000000  0.0  0.109242   2.541798  0.0   0.071563   \n",
+       "\n",
+       "         a9       a10       a11       a12       a13      a14        a15  a16  \\\n",
+       "0  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   5.726806  0.0   \n",
+       "1  0.000000  0.000000  0.000000  5.917846  0.000000  5.90886   0.000000  0.0   \n",
+       "2  0.000000  0.000000  0.000000  0.000000  6.646564  0.00000   0.000000  0.0   \n",
+       "3  0.000000  0.378781  0.420953  0.000000  0.000000  0.00000   0.000000  0.0   \n",
+       "4  8.223387  0.000000  0.000000  0.000000  0.000000  0.00000  10.744624  0.0   \n",
        "\n",
-       "   a12  a13        a14  a15       a16       a17  a18  a19  \n",
-       "0  0.0  0.0   0.000000  0.0 -1.162275  0.000000  0.0  0.0  \n",
-       "1  0.0  0.0  11.460403  0.0  0.000000  0.000000  0.0  0.0  \n",
-       "2  0.0  0.0   0.000000  0.0  0.000000  0.407392  0.0  0.0  \n",
-       "3  0.0  0.0   0.000000  0.0  0.000000  0.000000  0.0  0.0  \n",
-       "4  0.0  0.0   0.000000  0.0  0.000000  0.000000  0.0  0.0  "
+       "   a17  a18       a19  \n",
+       "0  0.0  0.0  0.000000  \n",
+       "1  0.0  0.0  0.000000  \n",
+       "2  0.0  0.0  3.399164  \n",
+       "3  0.0  0.0  0.000000  \n",
+       "4  0.0  0.0  0.000000  "
       ]
      },
      "execution_count": 20,
@@ -1302,57 +1295,57 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "  (41, 0)\t8.237732918475851\n",
-      "  (49, 0)\t-4.161219849238402\n",
-      "  (70, 0)\t-1.646588718395583\n",
-      "  (80, 0)\t11.607048248828713\n",
-      "  (81, 0)\t11.387095517746493\n",
-      "  (105, 0)\t4.059008225609349\n",
-      "  (107, 0)\t9.299030876304984\n",
-      "  (108, 0)\t10.652087054434446\n",
-      "  (127, 0)\t2.442578989241219\n",
-      "  (133, 0)\t-0.7674141633646347\n",
-      "  (135, 0)\t-6.091151515788713\n",
-      "  (145, 0)\t2.968949150266586\n",
-      "  (148, 0)\t5.649147779687932\n",
-      "  (158, 0)\t7.7809955768930745\n",
-      "  (166, 0)\t5.801884262747882\n",
-      "  (175, 0)\t7.3205065025042\n",
-      "  (181, 0)\t13.704683370645277\n",
-      "  (204, 0)\t15.915619596241733\n",
-      "  (207, 0)\t-0.2205888963107494\n",
-      "  (209, 0)\t3.565578265020142\n",
-      "  (215, 0)\t4.1493767841754154\n",
-      "  (231, 0)\t3.4286524053271803\n",
-      "  (233, 0)\t6.021200022977307\n",
-      "  (241, 0)\t4.247163658236771\n",
-      "  (249, 0)\t1.8502158424149273\n",
+      "  (896, 0)\t0.7194778152522069\n",
+      "  (385, 0)\t5.061243119202521\n",
+      "  (899, 0)\t8.032932656540671\n",
+      "  (1028, 0)\t10.072155866140903\n",
+      "  (133, 0)\t13.27741318265092\n",
+      "  (518, 0)\t2.242099518010387\n",
+      "  (647, 0)\t6.487369007371155\n",
+      "  (776, 0)\t5.621989952370181\n",
+      "  (9, 0)\t8.833796529523534\n",
+      "  (521, 0)\t7.719749292928572\n",
+      "  (777, 0)\t7.4610987015782975\n",
+      "  (394, 0)\t10.09026095476732\n",
+      "  (140, 0)\t2.974228870142501\n",
+      "  (653, 0)\t4.520704347545524\n",
+      "  (1037, 0)\t4.53896886415556\n",
+      "  (400, 0)\t4.0198547103826705\n",
+      "  (401, 0)\t-0.2557920447399875\n",
+      "  (1041, 0)\t1.8627471984893114\n",
+      "  (146, 0)\t9.834516073722536\n",
+      "  (1042, 0)\t7.850006814937681\n",
+      "  (275, 0)\t1.5747512513374389\n",
+      "  (662, 0)\t6.717038670488377\n",
+      "  (25, 0)\t7.311464380885098\n",
+      "  (281, 0)\t3.5147599499072024\n",
+      "  (409, 0)\t1.121874214291239\n",
       "  :\t:\n",
-      "  (9729, 19)\t7.226429647432215\n",
-      "  (9762, 19)\t-0.6042314722021014\n",
-      "  (9764, 19)\t-1.4827372788735615\n",
-      "  (9769, 19)\t4.140245505599609\n",
-      "  (9776, 19)\t-0.3441145182655059\n",
-      "  (9781, 19)\t-0.235562982602191\n",
-      "  (9782, 19)\t2.1458765970993223\n",
-      "  (9791, 19)\t7.219427633840467\n",
-      "  (9803, 19)\t6.6874487362355115\n",
-      "  (9807, 19)\t5.1769501512294465\n",
-      "  (9823, 19)\t-1.1040045399744103\n",
-      "  (9828, 19)\t3.074156937033751\n",
-      "  (9849, 19)\t0.4663962936122451\n",
-      "  (9851, 19)\t10.302861735090476\n",
-      "  (9862, 19)\t1.9377857550195872\n",
-      "  (9893, 19)\t8.991541850619656\n",
-      "  (9896, 19)\t-0.9003118390325282\n",
-      "  (9919, 19)\t2.4984693551284587\n",
-      "  (9934, 19)\t1.6161057487404191\n",
-      "  (9944, 19)\t6.063387997554039\n",
-      "  (9945, 19)\t11.038782286791717\n",
-      "  (9954, 19)\t13.750186699958661\n",
-      "  (9979, 19)\t0.9225731640357893\n",
-      "  (9995, 19)\t-1.775155437069923\n",
-      "  (9998, 19)\t12.265785237649636\n"
+      "  (8290, 19)\t19.23532976720017\n",
+      "  (8679, 19)\t3.9092712623274224\n",
+      "  (8935, 19)\t0.8411008847310036\n",
+      "  (9063, 19)\t12.010953214709328\n",
+      "  (9319, 19)\t3.470064419440258\n",
+      "  (8683, 19)\t14.397876149427695\n",
+      "  (8300, 19)\t10.524275022546979\n",
+      "  (8301, 19)\t0.6266917401191829\n",
+      "  (8557, 19)\t-0.4554588974911311\n",
+      "  (9197, 19)\t12.379896820812874\n",
+      "  (8304, 19)\t1.3276250981825033\n",
+      "  (8563, 19)\t-1.579631321204169\n",
+      "  (8442, 19)\t6.881252269650868\n",
+      "  (8315, 19)\t0.5811637925849389\n",
+      "  (8575, 19)\t15.52855242553137\n",
+      "  (9343, 19)\t-0.12679091919544638\n",
+      "  (9569, 19)\t9.316119794827424\n",
+      "  (9570, 19)\t10.791371431930969\n",
+      "  (9443, 19)\t4.7035396189880645\n",
+      "  (9452, 19)\t-0.9924476181662789\n",
+      "  (9713, 19)\t-3.2038209275781346\n",
+      "  (9719, 19)\t0.6578276176100656\n",
+      "  (9847, 19)\t9.57555910183088\n",
+      "  (9724, 19)\t0.990362915454171\n",
+      "  (9855, 19)\t1.153449284622368\n"
      ]
     }
    ],
@@ -1373,7 +1366,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -1387,7 +1380,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.12"
+   "version": "3.9.7"
   }
  },
  "nbformat": 4,
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index 17ba1c6d53f..2382abe38a9 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java
deleted file mode 100644
index 67a770f1346..00000000000
--- a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package ai.rapids.cudf.nvcomp;
-
-import ai.rapids.cudf.Cuda;
-import ai.rapids.cudf.BaseDeviceMemoryBuffer;
-import ai.rapids.cudf.DeviceMemoryBuffer;
-import ai.rapids.cudf.HostMemoryBuffer;
-
-/** Single-buffer compressor implementing LZ4 */
-public class LZ4Compressor {
-
-  /** LZ4 compression settings corresponding to a chunk size */
-  public static final class Configuration {
-    private final long metadataBytes;
-    private final long tempBytes;
-    private final long maxCompressedBytes;
-
-    Configuration(long metadataBytes, long tempBytes, long maxCompressedBytes) {
-      this.metadataBytes = metadataBytes;
-      this.tempBytes = tempBytes;
-      this.maxCompressedBytes = maxCompressedBytes;
-    }
-
-    /** Get the size of the metadata information in bytes */
-    public long getMetadataBytes() {
-      return metadataBytes;
-    }
-
-    /** Get the size of the temporary storage in bytes needed to compress */
-    public long getTempBytes() {
-      return tempBytes;
-    }
-
-    /** Get the maximum compressed output size in bytes */
-    public long getMaxCompressedBytes() {
-      return maxCompressedBytes;
-    }
-  }
-
-  /**
-   * Get the compression configuration necessary for a particular chunk size.
-   * @param chunkSize size of an LZ4 chunk in bytes
-   * @param uncompressedSize total size of the uncompressed data
-   * @return compression configuration for the specified chunk size
-   */
-  public static Configuration configure(long chunkSize, long uncompressedSize) {
-    long[] configs = NvcompJni.lz4CompressConfigure(chunkSize, uncompressedSize);
-    assert configs.length == 3;
-    return new Configuration(configs[0], configs[1], configs[2]);
-  }
-
-  /**
-   * Synchronously compress a buffer with LZ4.
-   * @param input      buffer to compress
-   * @param inputType  type of data within the buffer
-   * @param chunkSize  compression chunk size to use
-   * @param tempBuffer temporary storage space
-   * @param output     buffer that will contain the compressed result
-   * @param stream     CUDA stream to use
-   * @return size of the resulting compressed data stored to the output buffer
-   */
-  public static long compress(BaseDeviceMemoryBuffer input, CompressionType inputType,
-                              long chunkSize, BaseDeviceMemoryBuffer tempBuffer,
-                              BaseDeviceMemoryBuffer output, Cuda.Stream stream) {
-    if (chunkSize <= 0) {
-      throw new IllegalArgumentException("Illegal chunk size: " + chunkSize);
-    }
-    try (DeviceMemoryBuffer devOutputSizeBuffer = DeviceMemoryBuffer.allocate(Long.BYTES);
-         HostMemoryBuffer hostOutputSizeBuffer = HostMemoryBuffer.allocate(Long.BYTES)) {
-      compressAsync(devOutputSizeBuffer, input, inputType, chunkSize, tempBuffer, output, stream);
-      hostOutputSizeBuffer.copyFromDeviceBuffer(devOutputSizeBuffer, stream);
-      return hostOutputSizeBuffer.getLong(0);
-    }
-  }
-
-  /**
-   * Asynchronously compress a buffer with LZ4. The compressed size output buffer must be pinned
-   * memory for this operation to be truly asynchronous. Note that the caller must synchronize
-   * on the specified CUDA stream in order to safely examine the compressed output size!
-   * @param compressedSizeOutputBuffer device memory where the compressed output size will be stored
-   * @param input      buffer to compress
-   * @param inputType  type of data within the buffer
-   * @param chunkSize  compression chunk size to use
-   * @param tempBuffer temporary storage space
-   * @param output     buffer that will contain the compressed result
-   * @param stream     CUDA stream to use
-   */
-  public static void compressAsync(DeviceMemoryBuffer compressedSizeOutputBuffer,
-                                   BaseDeviceMemoryBuffer input, CompressionType inputType,
-                                   long chunkSize, BaseDeviceMemoryBuffer tempBuffer,
-                                   BaseDeviceMemoryBuffer output, Cuda.Stream stream) {
-    if (chunkSize <= 0) {
-      throw new IllegalArgumentException("Illegal chunk size: " + chunkSize);
-    }
-    if (compressedSizeOutputBuffer.getLength() < 8) {
-      throw new IllegalArgumentException("compressed output size buffer must be able to hold " +
-          "at least 8 bytes, size is only " + compressedSizeOutputBuffer.getLength());
-    }
-    NvcompJni.lz4CompressAsync(
-        compressedSizeOutputBuffer.getAddress(),
-        input.getAddress(),
-        input.getLength(),
-        inputType.nativeId,
-        chunkSize,
-        tempBuffer.getAddress(),
-        tempBuffer.getLength(),
-        output.getAddress(),
-        output.getLength(),
-        stream.getStream());
-  }
-}
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java
deleted file mode 100644
index 46b3127581b..00000000000
--- a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package ai.rapids.cudf.nvcomp;
-
-import ai.rapids.cudf.BaseDeviceMemoryBuffer;
-import ai.rapids.cudf.Cuda;
-
-/** Single-buffer decompression using LZ4 */
-public class LZ4Decompressor {
-
-  /**
-   * LZ4 decompression settings corresponding to an LZ4 compressed input.
-   * NOTE: Each instance must be closed to avoid a native memory leak.
-   */
-  public static final class Configuration implements AutoCloseable {
-    private final long metadataPtr;
-    private final long metadataSize;
-    private final long tempBytes;
-    private final long uncompressedBytes;
-
-    Configuration(long metadataPtr, long metadataSize, long tempBytes,
-                  long uncompressedBytes) {
-      this.metadataPtr = metadataPtr;
-      this.metadataSize = metadataSize;
-      this.tempBytes = tempBytes;
-      this.uncompressedBytes = uncompressedBytes;
-    }
-
-    /** Get the host address of the metadata */
-    public long getMetadataPtr() {
-      return metadataPtr;
-    }
-
-    /** Get the size of the metadata in bytes */
-    public long getMetadataSize() {
-      return metadataSize;
-    }
-
-    /** Get the size of the temporary buffer in bytes needed to decompress */
-    public long getTempBytes() {
-      return tempBytes;
-    }
-
-    /** Get the size of the uncompressed data in bytes */
-    public long getUncompressedBytes() {
-      return uncompressedBytes;
-    }
-
-    @Override
-    public void close() {
-      NvcompJni.lz4DestroyMetadata(metadataPtr);
-    }
-  }
-
-  /**
-   * Determine if a buffer is data compressed with LZ4.
-   * @param buffer data to examine
-   * @param stream CUDA stream to use
-   * @return true if the data is LZ4 compressed
-   */
-  public static boolean isLZ4Data(BaseDeviceMemoryBuffer buffer, Cuda.Stream stream) {
-    return NvcompJni.isLZ4Data(buffer.getAddress(), buffer.getLength(), stream.getStream());
-  }
-
-  /**
-   * Get the decompression configuration from compressed data.
-   * NOTE: The resulting configuration object must be closed to avoid a native memory leak.
-   * @param compressed data that has been compressed by the LZ4 compressor
-   * @param stream CUDA stream to use
-   * @return decompression configuration for the specified input
-   */
-  public static Configuration configure(BaseDeviceMemoryBuffer compressed, Cuda.Stream stream) {
-    long[] configs = NvcompJni.lz4DecompressConfigure(compressed.getAddress(),
-        compressed.getLength(), stream.getStream());
-    assert configs.length == 4;
-    return new Configuration(configs[0], configs[1], configs[2], configs[3]);
-  }
-
-  /**
-   * Asynchronously decompress data compressed with the LZ4 compressor.
-   * @param compressed buffer containing LZ4-compressed data
-   * @param config decompression configuration
-   * @param temp temporary storage buffer
-   * @param outputBuffer buffer that will be written with the uncompressed output
-   * @param stream CUDA stream to use
-   */
-  public static void decompressAsync(
-      BaseDeviceMemoryBuffer compressed,
-      Configuration config,
-      BaseDeviceMemoryBuffer temp,
-      BaseDeviceMemoryBuffer outputBuffer,
-      Cuda.Stream stream) {
-    NvcompJni.lz4DecompressAsync(
-        compressed.getAddress(),
-        compressed.getLength(),
-        config.getMetadataPtr(),
-        config.getMetadataSize(),
-        temp.getAddress(),
-        temp.getLength(),
-        outputBuffer.getAddress(),
-        outputBuffer.getLength(),
-        stream.getStream());
-  }
-}
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java
index 58f8390d0eb..57094008c08 100644
--- a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,100 +24,6 @@ class NvcompJni {
     NativeDepsLoader.loadNativeDeps();
   }
 
-  /**
-   * Determine if data is compressed with the nvcomp LZ4 compressor.
-   * @param inPtr device address of the compressed data
-   * @param inSize size of the compressed data in bytes
-   * @param stream CUDA stream to use
-   * @return true if the data is compressed with the nvcomp LZ4 compressor
-   */
-  static native boolean isLZ4Data(long inPtr, long inSize, long stream);
-
-  /**
-   * Determine if the metadata corresponds to data compressed with the nvcomp LZ4 compressor.
-   * @param metadataPtr address of the metadata object
-   * @return true if the metadata describes data compressed with the nvcomp LZ4 compressor.
-   */
-  static native boolean isLZ4Metadata(long metadataPtr);
-
-  /**
-   * Return the LZ4 compression configuration necessary for a particular chunk size.
-   * @param chunkSize maximum size of an uncompressed chunk in bytes
-   * @param uncompressedSize total size of the uncompressed data
-   * @return array of three longs containing metadata size, temp storage size,
-   *         and output buffer size
-   */
-  static native long[] lz4CompressConfigure(long chunkSize, long uncompressedSize);
-
-  /**
-   * Perform LZ4 compression asynchronously using the specified CUDA stream.
-   * @param compressedSizeOutputPtr host address of a 64-bit integer to update
-   *                                with the resulting compressed size of the
-   *                                data. For the operation to be truly
-   *                                asynchronous this should point to pinned
-   *                                host memory.
-   * @param inPtr device address of the uncompressed data
-   * @param inSize size of the uncompressed data in bytes
-   * @param inputType type of uncompressed data
-   * @param chunkSize size of an LZ4 chunk in bytes
-   * @param tempPtr device address of the temporary compression storage buffer
-   * @param tempSize size of the temporary storage buffer in bytes
-   * @param outPtr device address of the output buffer
-   * @param outSize size of the output buffer in bytes
-   * @param stream CUDA stream to use
-   */
-  static native void lz4CompressAsync(
-      long compressedSizeOutputPtr,
-      long inPtr,
-      long inSize,
-      int inputType,
-      long chunkSize,
-      long tempPtr,
-      long tempSize,
-      long outPtr,
-      long outSize,
-      long stream);
-
-  /**
-   * Return the decompression configuration for a compressed input.
-   * NOTE: The resulting configuration object must be closed to destroy the corresponding
-   * host-side metadata created by this method to avoid a native memory leak.
-   * @param inPtr device address of the compressed data
-   * @param inSize size of the compressed data
-   * @return array of four longs containing metadata address, metadata size, temp storage size,
-   *         and output buffer size
-   */
-  static native long[] lz4DecompressConfigure(long inPtr, long inSize, long stream);
-
-  /**
-   * Perform LZ4 decompression asynchronously using the specified CUDA stream.
-   * @param inPtr device address of the uncompressed data
-   * @param inSize size of the uncompressed data in bytes
-   * @param metadataPtr host address of the metadata
-   * @param metadataSize size of the metadata in bytes
-   * @param tempPtr device address of the temporary compression storage buffer
-   * @param tempSize size of the temporary storage buffer in bytes
-   * @param outPtr device address of the output buffer
-   * @param outSize size of the output buffer in bytes
-   * @param stream CUDA stream to use
-   */
-  static native void lz4DecompressAsync(
-      long inPtr,
-      long inSize,
-      long metadataPtr,
-      long metadataSize,
-      long tempPtr,
-      long tempSize,
-      long outPtr,
-      long outSize,
-      long stream);
-
-  /**
-   * Destroy host-side metadata created by {@link NvcompJni#lz4DecompressConfigure(long, long, long)}
-   * @param metadataPtr host address of metadata
-   */
-  static native void lz4DestroyMetadata(long metadataPtr);
-
   /**
    * Get the temporary workspace size required to perform compression of entire LZ4 batch.
    * @param batchSize number of chunks in the batch
diff --git a/java/src/main/native/src/NvcompJni.cpp b/java/src/main/native/src/NvcompJni.cpp
index 533654baee1..e616b7f66be 100644
--- a/java/src/main/native/src/NvcompJni.cpp
+++ b/java/src/main/native/src/NvcompJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,131 +56,6 @@ void check_nvcomp_status(JNIEnv *env, nvcompStatus_t status) {
 
 extern "C" {
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Data(JNIEnv *env, jclass,
-                                                                          jlong j_in_ptr,
-                                                                          jlong j_in_size,
-                                                                          jlong j_stream) {
-  try {
-    cudf::jni::auto_set_device(env);
-    auto in_ptr = reinterpret_cast<void const *>(j_in_ptr);
-    auto in_size = static_cast<std::size_t>(j_in_size);
-    auto stream = reinterpret_cast<cudaStream_t>(j_stream);
-    return nvcompLZ4IsData(in_ptr, in_size, stream);
-  }
-  CATCH_STD(env, 0)
-}
-
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Metadata(JNIEnv *env, jclass,
-                                                                              jlong metadata_ptr) {
-  try {
-    cudf::jni::auto_set_device(env);
-    return nvcompLZ4IsMetadata(reinterpret_cast<void *>(metadata_ptr));
-  }
-  CATCH_STD(env, 0)
-}
-
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressConfigure(
-    JNIEnv *env, jclass, jlong j_chunk_size, jlong j_uncompressed_size) {
-  try {
-    cudf::jni::auto_set_device(env);
-    nvcompLZ4FormatOpts opts{};
-    opts.chunk_size = static_cast<std::size_t>(j_chunk_size);
-    auto uncompressed_size = static_cast<std::size_t>(j_uncompressed_size);
-    std::size_t metadata_bytes = 0;
-    std::size_t temp_bytes = 0;
-    std::size_t out_bytes = 0;
-    auto status = nvcompLZ4CompressConfigure(&opts, NVCOMP_TYPE_CHAR, uncompressed_size,
-                                             &metadata_bytes, &temp_bytes, &out_bytes);
-    check_nvcomp_status(env, status);
-    cudf::jni::native_jlongArray result(env, 3);
-    result[0] = static_cast<jlong>(metadata_bytes);
-    result[1] = static_cast<jlong>(temp_bytes);
-    result[2] = static_cast<jlong>(out_bytes);
-    return result.get_jArray();
-  }
-  CATCH_STD(env, 0);
-}
-
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressAsync(
-    JNIEnv *env, jclass, jlong j_compressed_size_ptr, jlong j_in_ptr, jlong j_in_size,
-    jint j_input_type, jlong j_chunk_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr,
-    jlong j_out_size, jlong j_stream) {
-  try {
-    cudf::jni::auto_set_device(env);
-    auto in_ptr = reinterpret_cast<void const *>(j_in_ptr);
-    auto in_size = static_cast<std::size_t>(j_in_size);
-    auto comp_type = static_cast<nvcompType_t>(j_input_type);
-    nvcompLZ4FormatOpts opts{};
-    opts.chunk_size = static_cast<std::size_t>(j_chunk_size);
-    auto temp_ptr = reinterpret_cast<void *>(j_temp_ptr);
-    auto temp_size = static_cast<std::size_t>(j_temp_size);
-    auto out_ptr = reinterpret_cast<void *>(j_out_ptr);
-    auto compressed_size_ptr = reinterpret_cast<std::size_t *>(j_compressed_size_ptr);
-    auto stream = reinterpret_cast<cudaStream_t>(j_stream);
-    auto status = nvcompLZ4CompressAsync(&opts, comp_type, in_ptr, in_size, temp_ptr, temp_size,
-                                         out_ptr, compressed_size_ptr, stream);
-    check_nvcomp_status(env, status);
-  }
-  CATCH_STD(env, );
-}
-
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressConfigure(
-    JNIEnv *env, jclass, jlong j_input_ptr, jlong j_input_size, jlong j_stream) {
-  try {
-    cudf::jni::auto_set_device(env);
-    auto compressed_ptr = reinterpret_cast<void const *>(j_input_ptr);
-    auto compressed_bytes = static_cast<std::size_t>(j_input_size);
-    void *metadata_ptr = nullptr;
-    std::size_t metadata_bytes = 0;
-    std::size_t temp_bytes = 0;
-    std::size_t uncompressed_bytes = 0;
-    auto stream = reinterpret_cast<cudaStream_t>(j_stream);
-    auto status =
-        nvcompLZ4DecompressConfigure(compressed_ptr, compressed_bytes, &metadata_ptr,
-                                     &metadata_bytes, &temp_bytes, &uncompressed_bytes, stream);
-    check_nvcomp_status(env, status);
-    cudf::jni::native_jlongArray result(env, 4);
-    result[0] = reinterpret_cast<jlong>(metadata_ptr);
-    result[1] = static_cast<jlong>(metadata_bytes);
-    result[2] = static_cast<jlong>(temp_bytes);
-    result[3] = static_cast<jlong>(uncompressed_bytes);
-    return result.get_jArray();
-  }
-  CATCH_STD(env, 0);
-}
-
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressAsync(
-    JNIEnv *env, jclass, jlong j_in_ptr, jlong j_in_size, jlong j_metadata_ptr,
-    jlong j_metadata_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr, jlong j_out_size,
-    jlong j_stream) {
-  try {
-    cudf::jni::auto_set_device(env);
-    auto compressed_ptr = reinterpret_cast<void const *>(j_in_ptr);
-    auto compressed_bytes = static_cast<std::size_t>(j_in_size);
-    auto metadata_ptr = reinterpret_cast<void const *>(j_metadata_ptr);
-    auto metadata_bytes = static_cast<std::size_t>(j_metadata_size);
-    auto temp_ptr = reinterpret_cast<void *>(j_temp_ptr);
-    auto temp_bytes = static_cast<std::size_t>(j_temp_size);
-    auto uncompressed_ptr = reinterpret_cast<void *>(j_out_ptr);
-    auto uncompressed_bytes = static_cast<std::size_t>(j_out_size);
-    auto stream = reinterpret_cast<cudaStream_t>(j_stream);
-    auto status = nvcompLZ4DecompressAsync(compressed_ptr, compressed_bytes, metadata_ptr,
-                                           metadata_bytes, temp_ptr, temp_bytes, uncompressed_ptr,
-                                           uncompressed_bytes, stream);
-    check_nvcomp_status(env, status);
-  }
-  CATCH_STD(env, );
-}
-
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DestroyMetadata(JNIEnv *env, jclass,
-                                                                               jlong metadata_ptr) {
-  try {
-    cudf::jni::auto_set_device(env);
-    nvcompLZ4DestroyMetadata(reinterpret_cast<void *>(metadata_ptr));
-  }
-  CATCH_STD(env, );
-}
-
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetTempSize(
     JNIEnv *env, jclass, jlong j_batch_size, jlong j_max_chunk_size) {
   try {
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 78ac8a18107..cebe476dd87 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -137,8 +137,8 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
           left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
       char *copy_to = current_buffer_data + current_buffer_written;
 
-      CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost,
-                               stream.value()));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost,
+                                    stream.value()));
 
       copy_from = copy_from + amount_to_copy;
       current_buffer_written += amount_to_copy;
diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp
index 12fd45b831a..2ac535bbf2f 100644
--- a/java/src/main/native/src/cudf_jni_apis.hpp
+++ b/java/src/main/native/src/cudf_jni_apis.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu
index 683651799e7..13d1a5a94a9 100644
--- a/java/src/main/native/src/map_lookup.cu
+++ b/java/src/main/native/src/map_lookup.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,7 +123,7 @@ get_gather_map_for_map_values(column_view const &input, string_scalar &lookup_ke
   gpu_find_first<block_size, has_nulls><<<grid.num_blocks, block_size, 0, stream.value()>>>(
       *input_device_view, *output_view, lookup_key_device_view);
 
-  CHECK_CUDA(stream.value());
+  CUDF_CHECK_CUDA(stream.value());
 
   return gather_map;
 }
diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu
index 4d78f416134..96ee95c476d 100644
--- a/java/src/main/native/src/row_conversion.cu
+++ b/java/src/main/native/src/row_conversion.cu
@@ -1766,9 +1766,9 @@ std::vector<std::unique_ptr<column>> convert_to_rows(
     std::optional<rmm::device_uvector<strings_column_view::offset_iterator>> variable_width_offsets,
     rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) {
   int device_id;
-  CUDA_TRY(cudaGetDevice(&device_id));
+  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
   int total_shmem_in_bytes;
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
       cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
 
 #ifndef __CUDA_ARCH__ // __host__ code.
@@ -2097,9 +2097,9 @@ std::unique_ptr<table> convert_from_rows(lists_column_view const &input,
   auto const num_rows = input.parent().size();
 
   int device_id;
-  CUDA_TRY(cudaGetDevice(&device_id));
+  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
   int total_shmem_in_bytes;
-  CUDA_TRY(
+  CUDF_CUDA_TRY(
       cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
 
 #ifndef __CUDA_ARCH__ // __host__ code.
diff --git a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java
index c36d241500a..ec14a1cfee6 100644
--- a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java
+++ b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,16 +28,6 @@
 public class NvcompTest {
   private static final Logger log = LoggerFactory.getLogger(ColumnVector.class);
 
-  @Test
-  void testLZ4RoundTripViaLZ4DecompressorSync() {
-    lz4RoundTrip(false);
-  }
-
-  @Test
-  void testLZ4RoundTripViaLZ4DecompressorAsync() {
-    lz4RoundTrip(true);
-  }
-
   @Test
   void testBatchedLZ4RoundTripAsync() {
     final Cuda.Stream stream = Cuda.DEFAULT_STREAM;
@@ -134,90 +124,4 @@ private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) {
       throw new RuntimeException(t);
     }
   }
-
-  private void lz4RoundTrip(boolean useAsync) {
-    final Cuda.Stream stream = Cuda.DEFAULT_STREAM;
-    final long chunkSize = 64 * 1024;
-    final int numElements = 10 * 1024 * 1024 + 1;
-    long[] data = new long[numElements];
-    for (int i = 0; i < numElements; ++i) {
-      data[i] = i;
-    }
-
-    DeviceMemoryBuffer tempBuffer = null;
-    DeviceMemoryBuffer compressedBuffer = null;
-    DeviceMemoryBuffer uncompressedBuffer = null;
-    try (ColumnVector v = ColumnVector.fromLongs(data)) {
-      BaseDeviceMemoryBuffer inputBuffer = v.getDeviceBufferFor(BufferType.DATA);
-      final long uncompressedSize = inputBuffer.getLength();
-      log.debug("Uncompressed size is {}", uncompressedSize);
-
-      LZ4Compressor.Configuration compressConf =
-          LZ4Compressor.configure(chunkSize, uncompressedSize);
-      Assertions.assertTrue(compressConf.getMetadataBytes() > 0);
-      log.debug("Using {} temporary space for lz4 compression", compressConf.getTempBytes());
-      tempBuffer = DeviceMemoryBuffer.allocate(compressConf.getTempBytes());
-      log.debug("lz4 compressed size estimate is {}", compressConf.getMaxCompressedBytes());
-
-      compressedBuffer = DeviceMemoryBuffer.allocate(compressConf.getMaxCompressedBytes());
-
-      long startTime = System.nanoTime();
-      long compressedSize;
-      if (useAsync) {
-        try (DeviceMemoryBuffer devCompressedSizeBuffer = DeviceMemoryBuffer.allocate(8);
-             HostMemoryBuffer hostCompressedSizeBuffer = HostMemoryBuffer.allocate(8)) {
-          LZ4Compressor.compressAsync(devCompressedSizeBuffer, inputBuffer, CompressionType.CHAR,
-              chunkSize, tempBuffer, compressedBuffer, stream);
-          hostCompressedSizeBuffer.copyFromDeviceBufferAsync(devCompressedSizeBuffer, stream);
-          stream.sync();
-          compressedSize = hostCompressedSizeBuffer.getLong(0);
-        }
-      } else {
-        compressedSize = LZ4Compressor.compress(inputBuffer, CompressionType.CHAR, chunkSize,
-            tempBuffer, compressedBuffer, stream);
-      }
-      double duration = (System.nanoTime() - startTime) / 1000.0;
-      log.info("Compressed with lz4 to {} in {} us", compressedSize, duration);
-
-      tempBuffer.close();
-      tempBuffer = null;
-
-      try (LZ4Decompressor.Configuration decompressConf =
-               LZ4Decompressor.configure(compressedBuffer, stream)) {
-        final long tempSize = decompressConf.getTempBytes();
-
-        log.debug("Using {} temporary space for lz4 compression", tempSize);
-        tempBuffer = DeviceMemoryBuffer.allocate(tempSize);
-
-        final long outSize = decompressConf.getUncompressedBytes();
-        Assertions.assertEquals(inputBuffer.getLength(), outSize);
-
-        uncompressedBuffer = DeviceMemoryBuffer.allocate(outSize);
-
-        LZ4Decompressor.decompressAsync(compressedBuffer, decompressConf, tempBuffer,
-            uncompressedBuffer, stream);
-
-        try (ColumnVector v2 = new ColumnVector(
-            DType.INT64,
-            numElements,
-            Optional.empty(),
-            uncompressedBuffer,
-            null,
-            null);
-             HostColumnVector hv2 = v2.copyToHost()) {
-          uncompressedBuffer = null;
-          for (int i = 0; i < numElements; ++i) {
-            long val = hv2.getLong(i);
-            if (val != i) {
-              Assertions.fail("Expected " + i + " at " + i + " found " + val);
-            }
-          }
-        }
-      }
-    } finally {
-      closeBuffer(tempBuffer);
-      closeBuffer(compressedBuffer);
-      closeBuffer(uncompressedBuffer);
-    }
-  }
 }
diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py
index dfc59a1f18d..a6f27691f56 100644
--- a/python/cudf/cudf/_fuzz_testing/io.py
+++ b/python/cudf/cudf/_fuzz_testing/io.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import copy
 import json
diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py
index 6b536fc3e2e..54e49b63e41 100644
--- a/python/cudf/cudf/_fuzz_testing/main.py
+++ b/python/cudf/cudf/_fuzz_testing/main.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from cudf._fuzz_testing import fuzzer
 
diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py
index c6281349c50..60a2afed39b 100644
--- a/python/cudf/cudf/_version.py
+++ b/python/cudf/cudf/_version.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 # This file helps to compute a version number in source trees obtained from
 # git-archive tarball (such as those provided by githubs download-from-tag
 # feature). Distribution tarballs (built by setup.py sdist) and build
diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py
index 7879261139d..f21eb4e4d8c 100644
--- a/python/cudf/cudf/comm/gpuarrow.py
+++ b/python/cudf/cudf/comm/gpuarrow.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 from collections import OrderedDict
 from collections.abc import Sequence
 
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 60d13150b39..3f8c8997803 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -2,7 +2,7 @@
 
 import pickle
 from functools import cached_property
-from typing import List, Sequence
+from typing import List, Optional, Sequence
 
 import numpy as np
 import pyarrow as pa
@@ -337,16 +337,20 @@ def __init__(self, parent: ParentType):
             )
         super().__init__(parent=parent)
 
-    def get(self, index: int) -> ParentType:
+    def get(
+        self, index: int, default: Optional[ScalarLike] = None
+    ) -> ParentType:
         """
-        Extract element at the given index from each component
+        Extract element at the given index from each list.
 
-        Extract element from lists, tuples, or strings in
-        each element in the Series/Index.
+        If the index is out of bounds for any list,
+        return <NA> or, if provided, ``default``.
+        Thus, this method never raises an ``IndexError``.
 
         Parameters
         ----------
         index : int
+        default : scalar, optional
 
         Returns
         -------
@@ -360,14 +364,37 @@ def get(self, index: int) -> ParentType:
         1    5
         2    6
         dtype: int64
+
+        >>> s = cudf.Series([[1, 2], [3, 4, 5], [4, 5, 6]])
+        >>> s.list.get(2)
+        0    <NA>
+        1       5
+        2       6
+        dtype: int64
+
+        >>> s = cudf.Series([[1, 2], [3, 4, 5], [4, 5, 6]])
+        >>> s.list.get(2, default=0)
+        0   0
+        1   5
+        2   6
+        dtype: int64
         """
-        min_col_list_len = self.len().min()
-        if -min_col_list_len <= index < min_col_list_len:
-            return self._return_or_inplace(
-                extract_element(self._column, index)
+        out = extract_element(self._column, index)
+
+        if not (default is None or default is cudf.NA):
+            # determine rows for which `index` is out-of-bounds
+            lengths = count_elements(self._column)
+            out_of_bounds_mask = (np.negative(index) > lengths) | (
+                index >= lengths
             )
-        else:
-            raise IndexError("list index out of range")
+
+            # replace the value in those rows (should be NA) with `default`
+            if out_of_bounds_mask.any():
+                out = out._scatter_by_column(
+                    out_of_bounds_mask, cudf.Scalar(default)
+                )
+
+        return self._return_or_inplace(out)
 
     def contains(self, search_key: ScalarLike) -> ParentType:
         """
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ef8e9c4dffc..d5d45c341d5 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2006,7 +2006,9 @@ def filter_alphanum(
             repl = ""
 
         return self._return_or_inplace(
-            libstrings.filter_alphanum(self._column, cudf.Scalar(repl), keep),
+            libstrings.filter_alphanum(
+                self._column, cudf.Scalar(repl, "str"), keep
+            ),
         )
 
     def slice_from(
@@ -2141,7 +2143,7 @@ def slice_replace(
 
         return self._return_or_inplace(
             libstrings.slice_replace(
-                self._column, start, stop, cudf.Scalar(repl)
+                self._column, start, stop, cudf.Scalar(repl, "str")
             ),
         )
 
@@ -2192,7 +2194,7 @@ def insert(self, start: int = 0, repl: str = None) -> SeriesOrIndex:
             repl = ""
 
         return self._return_or_inplace(
-            libstrings.insert(self._column, start, cudf.Scalar(repl)),
+            libstrings.insert(self._column, start, cudf.Scalar(repl, "str")),
         )
 
     def get(self, i: int = 0) -> SeriesOrIndex:
@@ -2643,7 +2645,7 @@ def rsplit(
                 )
             else:
                 result_table = libstrings.rsplit_record(
-                    self._column, cudf.Scalar(pat), n
+                    self._column, cudf.Scalar(pat, "str"), n
                 )
 
         return self._return_or_inplace(result_table, expand=expand)
@@ -2726,7 +2728,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
 
         return self._return_or_inplace(
             cudf.core.frame.Frame(
-                *libstrings.partition(self._column, cudf.Scalar(sep))
+                *libstrings.partition(self._column, cudf.Scalar(sep, "str"))
             ),
             expand=expand,
         )
@@ -2793,7 +2795,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
 
         return self._return_or_inplace(
             cudf.core.frame.Frame(
-                *libstrings.rpartition(self._column, cudf.Scalar(sep))
+                *libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))
             ),
             expand=expand,
         )
@@ -3194,7 +3196,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex:
             to_strip = ""
 
         return self._return_or_inplace(
-            libstrings.strip(self._column, cudf.Scalar(to_strip))
+            libstrings.strip(self._column, cudf.Scalar(to_strip, "str"))
         )
 
     def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
@@ -3241,7 +3243,7 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
             to_strip = ""
 
         return self._return_or_inplace(
-            libstrings.lstrip(self._column, cudf.Scalar(to_strip))
+            libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str"))
         )
 
     def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
@@ -3296,7 +3298,7 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
             to_strip = ""
 
         return self._return_or_inplace(
-            libstrings.rstrip(self._column, cudf.Scalar(to_strip))
+            libstrings.rstrip(self._column, cudf.Scalar(to_strip, "str"))
         )
 
     def wrap(self, width: int, **kwargs) -> SeriesOrIndex:
@@ -4245,7 +4247,7 @@ def filter_characters(
         table = str.maketrans(table)
         return self._return_or_inplace(
             libstrings.filter_characters(
-                self._column, table, keep, cudf.Scalar(repl)
+                self._column, table, keep, cudf.Scalar(repl, "str")
             ),
         )
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 08a30729e7c..1b85769b84d 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6126,7 +6126,7 @@ def make_binop_func(op, postprocess=None):
     # def postprocess(left, right, output)
     # where left and right are the inputs to the binop and output is the result
     # of calling the wrapped Frame binop.
-    wrapped_func = getattr(Frame, op)
+    wrapped_func = getattr(IndexedFrame, op)
 
     @functools.wraps(wrapped_func)
     def wrapper(self, other, axis="columns", level=None, fill_value=None):
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 75c6e4d0964..1382ebfd8ee 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3649,1623 +3649,6 @@ def __invert__(self):
             self._index,
         )
 
-    @_cudf_nvtx_annotate
-    def add(self, other, axis, level=None, fill_value=None):
-        """
-        Get Addition of dataframe or series and other, element-wise (binary
-        operator `add`).
-
-        Equivalent to ``frame + other``, but with support to substitute a
-        ``fill_value`` for missing data in one of the inputs.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df + 1
-                   angles  degrees
-        circle          1      361
-        triangle        4      181
-        rectangle       5      361
-        >>> df.add(1)
-                   angles  degrees
-        circle          1      361
-        triangle        4      181
-        rectangle       5      361
-
-        **Series**
-
-        >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd'])
-        >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e'])
-        >>> a.add(b)
-        a       2
-        b    <NA>
-        c    <NA>
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        >>> a.add(b, fill_value=0)
-        a       2
-        b       1
-        c       1
-        d       1
-        e    <NA>
-        dtype: int64
-        """
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__add__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def radd(self, other, axis, level=None, fill_value=None):
-        """
-        Get Addition of dataframe or series and other, element-wise (binary
-        operator `radd`).
-
-        Equivalent to ``other + frame``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `add`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df + 1
-                   angles  degrees
-        circle          1      361
-        triangle        4      181
-        rectangle       5      361
-        >>> df.radd(1)
-                   angles  degrees
-        circle          1      361
-        triangle        4      181
-        rectangle       5      361
-
-        **Series**
-
-        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a       1
-        b       2
-        c       3
-        d    <NA>
-        dtype: int64
-        >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a       1
-        b    <NA>
-        d       1
-        e    <NA>
-        dtype: int64
-        >>> a.add(b, fill_value=0)
-        a       2
-        b       2
-        c       3
-        d       1
-        e    <NA>
-        dtype: int64
-
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__radd__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def subtract(self, other, axis, level=None, fill_value=None):
-        """
-        Get Subtraction of dataframe or series and other, element-wise (binary
-        operator `sub`).
-
-        Equivalent to ``frame - other``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `rsub`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df.sub(1)
-                   angles  degrees
-        circle         -1      359
-        triangle        2      179
-        rectangle       3      359
-        >>> df.sub([1, 2])
-                   angles  degrees
-        circle         -1      358
-        triangle        2      178
-        rectangle       3      358
-
-        **Series**
-
-        >>> a = cudf.Series([10, 20, None, 30, None], index=['a', 'b', 'c', 'd', 'e'])
-        >>> a
-        a      10
-        b      20
-        c    <NA>
-        d      30
-        e    <NA>
-        dtype: int64
-        >>> b = cudf.Series([1, None, 2, 30], index=['a', 'c', 'b', 'd'])
-        >>> b
-        a       1
-        c    <NA>
-        b       2
-        d      30
-        dtype: int64
-        >>> a.subtract(b, fill_value=2)
-        a       9
-        b      18
-        c    <NA>
-        d       0
-        e    <NA>
-        dtype: int64
-
-        """  # noqa: E501
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__sub__", fill_value)
-
-    sub = subtract
-
-    @_cudf_nvtx_annotate
-    def rsub(self, other, axis, level=None, fill_value=None):
-        """
-        Get Subtraction of dataframe or series and other, element-wise (binary
-        operator `rsub`).
-
-        Equivalent to ``other - frame``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `sub`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df
-                   angles  degrees
-        circle          0      360
-        triangle        3      180
-        rectangle       4      360
-        >>> df.rsub(1)
-                   angles  degrees
-        circle          1     -359
-        triangle       -2     -179
-        rectangle      -3     -359
-        >>> df.rsub([1, 2])
-                   angles  degrees
-        circle          1     -358
-        triangle       -2     -178
-        rectangle      -3     -358
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a       1
-        b       2
-        c       3
-        d    <NA>
-        dtype: int64
-        >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a       1
-        b    <NA>
-        d       2
-        e    <NA>
-        dtype: int64
-        >>> a.rsub(b, fill_value=10)
-        a       0
-        b       8
-        c       7
-        d      -8
-        e    <NA>
-        dtype: int64
-        """
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__rsub__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def multiply(self, other, axis, level=None, fill_value=None):
-        """
-        Get Multiplication of dataframe or series and other, element-wise
-        (binary operator `mul`).
-
-        Equivalent to ``frame * other``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `rmul`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> other = cudf.DataFrame({'angles': [0, 3, 4]},
-        ...                      index=['circle', 'triangle', 'rectangle'])
-        >>> df * other
-                   angles degrees
-        circle          0    <NA>
-        triangle        9    <NA>
-        rectangle      16    <NA>
-        >>> df.mul(other, fill_value=0)
-                   angles  degrees
-        circle          0        0
-        triangle        9        0
-        rectangle      16        0
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a       1
-        b       2
-        c       3
-        d    <NA>
-        dtype: int64
-        >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a       1
-        b    <NA>
-        d       2
-        e    <NA>
-        dtype: int64
-        >>> a.multiply(b, fill_value=0)
-        a       1
-        b       0
-        c       0
-        d       0
-        e    <NA>
-        dtype: int64
-
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__mul__", fill_value)
-
-    mul = multiply
-
-    @_cudf_nvtx_annotate
-    def rmul(self, other, axis, level=None, fill_value=None):
-        """
-        Get Multiplication of dataframe or series and other, element-wise
-        (binary operator `rmul`).
-
-        Equivalent to ``other * frame``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `mul`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> other = cudf.DataFrame({'angles': [0, 3, 4]},
-        ...                      index=['circle', 'triangle', 'rectangle'])
-        >>> other * df
-                   angles degrees
-        circle          0    <NA>
-        triangle        9    <NA>
-        rectangle      16    <NA>
-        >>> df.rmul(other, fill_value=0)
-                   angles  degrees
-        circle          0        0
-        triangle        9        0
-        rectangle      16        0
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e'])
-        >>> a
-        a      10
-        b      20
-        c    <NA>
-        d      30
-        e      40
-        dtype: int64
-        >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f'])
-        >>> b
-        a    <NA>
-        b       1
-        d      20
-        e       5
-        f       4
-        dtype: int64
-        >>> a.rmul(b, fill_value=2)
-        a      20
-        b      20
-        c    <NA>
-        d     600
-        e     200
-        f       8
-        dtype: int64
-        """  # noqa E501
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__rmul__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def mod(self, other, axis, level=None, fill_value=None):
-        """
-        Get Modulo division of dataframe or series and other, element-wise
-        (binary operator `mod`).
-
-        Equivalent to ``frame % other``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `rmod`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df % 100
-                   angles  degrees
-        circle          0       60
-        triangle        3       80
-        rectangle       4       60
-        >>> df.mod(100)
-                   angles  degrees
-        circle          0       60
-        triangle        3       80
-        rectangle       4       60
-
-        **Series**
-
-        >>> import cudf
-        >>> series = cudf.Series([10, 20, 30])
-        >>> series
-        0    10
-        1    20
-        2    30
-        dtype: int64
-        >>> series.mod(4)
-        0    2
-        1    0
-        2    2
-        dtype: int64
-
-
-        """
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__mod__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def rmod(self, other, axis, level=None, fill_value=None):
-        """
-        Get Modulo division of dataframe or series and other, element-wise
-        (binary operator `rmod`).
-
-        Equivalent to ``other % frame``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `mod`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [1, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> 100 % df
-                   angles  degrees
-        circle          0      100
-        triangle        1      100
-        rectangle       0      100
-        >>> df.rmod(100)
-                   angles  degrees
-        circle          0      100
-        triangle        1      100
-        rectangle       0      100
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e'])
-        >>> a
-        a      10
-        b      20
-        c    <NA>
-        d      30
-        e      40
-        dtype: int64
-        >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f'])
-        >>> b
-        a    <NA>
-        b       1
-        d      20
-        e       5
-        f       4
-        dtype: int64
-        >>> a.rmod(b, fill_value=10)
-        a       0
-        b       1
-        c    <NA>
-        d      20
-        e       5
-        f       4
-        dtype: int64
-        """  # noqa E501
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__rmod__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def pow(self, other, axis, level=None, fill_value=None):
-        """
-        Get Exponential power of dataframe series and other, element-wise
-        (binary operator `pow`).
-
-        Equivalent to ``frame ** other``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `rpow`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [1, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df ** 2
-                   angles  degrees
-        circle          0   129600
-        triangle        9    32400
-        rectangle      16   129600
-        >>> df.pow(2)
-                   angles  degrees
-        circle          0   129600
-        triangle        9    32400
-        rectangle      16   129600
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a       1
-        b       2
-        c       3
-        d    <NA>
-        dtype: int64
-        >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a      10
-        b    <NA>
-        d      12
-        e    <NA>
-        dtype: int64
-        >>> a.pow(b, fill_value=0)
-        a       1
-        b       1
-        c       1
-        d       0
-        e    <NA>
-        dtype: int64
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__pow__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def rpow(self, other, axis, level=None, fill_value=None):
-        """
-        Get Exponential power of dataframe or series and other, element-wise
-        (binary operator `pow`).
-
-        Equivalent to ``other ** frame``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `pow`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [1, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> 1 ** df
-                   angles  degrees
-        circle          1        1
-        triangle        1        1
-        rectangle       1        1
-        >>> df.rpow(1)
-                   angles  degrees
-        circle          1        1
-        triangle        1        1
-        rectangle       1        1
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a       1
-        b       2
-        c       3
-        d    <NA>
-        dtype: int64
-        >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a      10
-        b    <NA>
-        d      12
-        e    <NA>
-        dtype: int64
-        >>> a.rpow(b, fill_value=0)
-        a      10
-        b       0
-        c       0
-        d       1
-        e    <NA>
-        dtype: int64
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__rpow__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def floordiv(self, other, axis, level=None, fill_value=None):
-        """
-        Get Integer division of dataframe or series and other, element-wise
-        (binary operator `floordiv`).
-
-        Equivalent to ``frame // other``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `rfloordiv`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [1, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df.floordiv(2)
-                   angles  degrees
-        circle          0      180
-        triangle        1       90
-        rectangle       2      180
-        >>> df // 2
-                   angles  degrees
-        circle          0      180
-        triangle        1       90
-        rectangle       2      180
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a       1
-        b       1
-        c       1
-        d    <NA>
-        dtype: int64
-        >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a       1
-        b    <NA>
-        d       1
-        e    <NA>
-        dtype: int64
-        >>> a.floordiv(b)
-        a       1
-        b    <NA>
-        c    <NA>
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__floordiv__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def rfloordiv(self, other, axis, level=None, fill_value=None):
-        """
-        Get Integer division of dataframe or series and other, element-wise
-        (binary operator `rfloordiv`).
-
-        Equivalent to ``other // dataframe``, but with support to substitute
-        a fill_value for missing data in one of the inputs. With reverse
-        version, `floordiv`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'col1': [10, 11, 23],
-        ... 'col2': [101, 122, 321]})
-        >>> df
-           col1  col2
-        0    10   101
-        1    11   122
-        2    23   321
-        >>> df.rfloordiv(df)
-           col1  col2
-        0     1     1
-        1     1     1
-        2     1     1
-        >>> df.rfloordiv(200)
-           col1  col2
-        0    20     1
-        1    18     1
-        2     8     0
-        >>> df.rfloordiv(100)
-           col1  col2
-        0    10     0
-        1     9     0
-        2     4     0
-
-        **Series**
-
-        >>> import cudf
-        >>> s = cudf.Series([1, 2, 10, 17])
-        >>> s
-        0     1
-        1     2
-        2    10
-        3    17
-        dtype: int64
-        >>> s.rfloordiv(100)
-        0    100
-        1     50
-        2     10
-        3      5
-        dtype: int64
-        >>> s = cudf.Series([10, 20, None])
-        >>> s
-        0      10
-        1      20
-        2    <NA>
-        dtype: int64
-        >>> s.rfloordiv(200)
-        0      20
-        1      10
-        2    <NA>
-        dtype: int64
-        >>> s.rfloordiv(200, fill_value=2)
-        0     20
-        1     10
-        2    100
-        dtype: int64
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__rfloordiv__", fill_value)
-
-    @_cudf_nvtx_annotate
-    def truediv(self, other, axis, level=None, fill_value=None):
-        """
-        Get Floating division of dataframe or series and other, element-wise
-        (binary operator `truediv`).
-
-        Equivalent to ``frame / other``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `rtruediv`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df.truediv(10)
-                   angles  degrees
-        circle        0.0     36.0
-        triangle      0.3     18.0
-        rectangle     0.4     36.0
-        >>> df.div(10)
-                   angles  degrees
-        circle        0.0     36.0
-        triangle      0.3     18.0
-        rectangle     0.4     36.0
-        >>> df / 10
-                   angles  degrees
-        circle        0.0     36.0
-        triangle      0.3     18.0
-        rectangle     0.4     36.0
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([1, 10, 20, None], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a       1
-        b      10
-        c      20
-        d    <NA>
-        dtype: int64
-        >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a       1
-        b    <NA>
-        d       2
-        e    <NA>
-        dtype: int64
-        >>> a.truediv(b, fill_value=0)
-        a     1.0
-        b     Inf
-        c     Inf
-        d     0.0
-        e    <NA>
-        dtype: float64
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__truediv__", fill_value)
-
-    # Alias for truediv
-    div = truediv
-    divide = truediv
-
-    @_cudf_nvtx_annotate
-    def rtruediv(self, other, axis, level=None, fill_value=None):
-        """
-        Get Floating division of dataframe or series and other, element-wise
-        (binary operator `rtruediv`).
-
-        Equivalent to ``other / frame``, but with support to substitute a
-        fill_value for missing data in one of the inputs. With reverse
-        version, `truediv`.
-
-        Parameters
-        ----------
-
-        other : scalar, sequence, Series, or DataFrame
-            Any single or multiple element data structure, or list-like object.
-        axis : int or string
-            Only ``0`` is supported for series, ``1`` or ``columns`` supported
-            for dataframe
-        fill_value  : float or None, default None
-            Fill existing missing (NaN) values, and any new element needed
-            for successful DataFrame alignment, with this value before
-            computation. If data in both corresponding DataFrame locations
-            is missing the result will be missing.
-
-        Returns
-        -------
-        DataFrame or Series
-            Result of the arithmetic operation.
-
-        Examples
-        --------
-
-        **DataFrame**
-
-        >>> import cudf
-        >>> df = cudf.DataFrame({'angles': [0, 3, 4],
-        ...                    'degrees': [360, 180, 360]},
-        ...                   index=['circle', 'triangle', 'rectangle'])
-        >>> df
-                   angles  degrees
-        circle          0      360
-        triangle        3      180
-        rectangle       4      360
-        >>> df.rtruediv(10)
-                     angles   degrees
-        circle          inf  0.027778
-        triangle   3.333333  0.055556
-        rectangle  2.500000  0.027778
-        >>> df.rdiv(10)
-                     angles   degrees
-        circle          inf  0.027778
-        triangle   3.333333  0.055556
-        rectangle  2.500000  0.027778
-        >>> 10 / df
-                     angles   degrees
-        circle          inf  0.027778
-        triangle   3.333333  0.055556
-        rectangle  2.500000  0.027778
-
-        **Series**
-
-        >>> import cudf
-        >>> a = cudf.Series([10, 20, None, 30], index=['a', 'b', 'c', 'd'])
-        >>> a
-        a      10
-        b      20
-        c    <NA>
-        d      30
-        dtype: int64
-        >>> b = cudf.Series([1, None, 2, 3], index=['a', 'b', 'd', 'e'])
-        >>> b
-        a       1
-        b    <NA>
-        d       2
-        e       3
-        dtype: int64
-        >>> a.rtruediv(b, fill_value=0)
-        a            0.1
-        b            0.0
-        c           <NA>
-        d    0.066666667
-        e            Inf
-        dtype: float64
-        """
-
-        if level is not None:
-            raise NotImplementedError("level parameter is not supported yet.")
-
-        return self._binaryop(other, "__rtruediv__", fill_value)
-
-    # Alias for rtruediv
-    rdiv = rtruediv
-
-    @_cudf_nvtx_annotate
-    def eq(self, other, axis="columns", level=None, fill_value=None):
-        """Equal to, element-wise (binary operator eq).
-
-        Parameters
-        ----------
-        other : Series or scalar value
-        fill_value : None or value
-            Value to fill nulls with before computation. If data in both
-            corresponding Series locations is null the result will be null
-
-        Returns
-        -------
-        Frame
-            The result of the operation.
-
-        Examples
-        --------
-        **DataFrame**
-
-        >>> left = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'c': [7, 8, 9]}
-        ... )
-        >>> right = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'd': [10, 12, 12]}
-        ... )
-        >>> left.eq(right)
-              a     b     c     d
-        0  True  True  <NA>  <NA>
-        1  True  True  <NA>  <NA>
-        2  True  True  <NA>  <NA>
-        >>> left.eq(right, fill_value=7)
-              a     b      c      d
-        0  True  True   True  False
-        1  True  True  False  False
-        2  True  True  False  False
-
-        **Series**
-
-        >>> a = cudf.Series([1, 2, 3, None, 10, 20],
-        ...                 index=['a', 'c', 'd', 'e', 'f', 'g'])
-        >>> a
-        a       1
-        c       2
-        d       3
-        e    <NA>
-        f      10
-        g      20
-        dtype: int64
-        >>> b = cudf.Series([-10, 23, -1, None, None],
-        ...                 index=['a', 'b', 'c', 'd', 'e'])
-        >>> b
-        a     -10
-        b      23
-        c      -1
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        >>> a.eq(b, fill_value=2)
-        a    False
-        b    False
-        c    False
-        d    False
-        e     <NA>
-        f    False
-        g    False
-        dtype: bool
-        """
-        return self._binaryop(
-            other=other, op="__eq__", fill_value=fill_value, can_reindex=True
-        )
-
-    @_cudf_nvtx_annotate
-    def ne(self, other, axis="columns", level=None, fill_value=None):
-        """Not equal to, element-wise (binary operator ne).
-
-        Parameters
-        ----------
-        other : Series or scalar value
-        fill_value : None or value
-            Value to fill nulls with before computation. If data in both
-            corresponding Series locations is null the result will be null
-
-        Returns
-        -------
-        Frame
-            The result of the operation.
-
-        Examples
-        --------
-        **DataFrame**
-
-        >>> left = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'c': [7, 8, 9]}
-        ... )
-        >>> right = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'd': [10, 12, 12]}
-        ... )
-        >>> left.ne(right)
-               a      b     c     d
-        0  False  False  <NA>  <NA>
-        1  False  False  <NA>  <NA>
-        2  False  False  <NA>  <NA>
-        >>> left.ne(right, fill_value=7)
-               a      b      c     d
-        0  False  False  False  True
-        1  False  False   True  True
-        2  False  False   True  True
-
-        **Series**
-
-        >>> a = cudf.Series([1, 2, 3, None, 10, 20],
-        ...                 index=['a', 'c', 'd', 'e', 'f', 'g'])
-        >>> a
-        a       1
-        c       2
-        d       3
-        e    <NA>
-        f      10
-        g      20
-        dtype: int64
-        >>> b = cudf.Series([-10, 23, -1, None, None],
-        ...                 index=['a', 'b', 'c', 'd', 'e'])
-        >>> b
-        a     -10
-        b      23
-        c      -1
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        >>> a.ne(b, fill_value=2)
-        a    True
-        b    True
-        c    True
-        d    True
-        e    <NA>
-        f    True
-        g    True
-        dtype: bool
-        """  # noqa: E501
-        return self._binaryop(
-            other=other, op="__ne__", fill_value=fill_value, can_reindex=True
-        )
-
-    @_cudf_nvtx_annotate
-    def lt(self, other, axis="columns", level=None, fill_value=None):
-        """Less than, element-wise (binary operator lt).
-
-        Parameters
-        ----------
-        other : Series or scalar value
-        fill_value : None or value
-            Value to fill nulls with before computation. If data in both
-            corresponding Series locations is null the result will be null
-
-        Returns
-        -------
-        Frame
-            The result of the operation.
-
-        Examples
-        --------
-        **DataFrame**
-
-        >>> left = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'c': [7, 8, 9]}
-        ... )
-        >>> right = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'd': [10, 12, 12]}
-        ... )
-        >>> left.lt(right)
-               a      b     c     d
-        0  False  False  <NA>  <NA>
-        1  False  False  <NA>  <NA>
-        2  False  False  <NA>  <NA>
-        >>> left.lt(right, fill_value=7)
-               a      b      c     d
-        0  False  False  False  True
-        1  False  False  False  True
-        2  False  False  False  True
-
-        **Series**
-
-        >>> a = cudf.Series([1, 2, 3, None, 10, 20],
-        ...                 index=['a', 'c', 'd', 'e', 'f', 'g'])
-        >>> a
-        a       1
-        c       2
-        d       3
-        e    <NA>
-        f      10
-        g      20
-        dtype: int64
-        >>> b = cudf.Series([-10, 23, -1, None, None],
-        ...                 index=['a', 'b', 'c', 'd', 'e'])
-        >>> b
-        a     -10
-        b      23
-        c      -1
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        >>> a.lt(b, fill_value=-10)
-        a    False
-        b     True
-        c    False
-        d    False
-        e     <NA>
-        f    False
-        g    False
-        dtype: bool
-        """  # noqa: E501
-        return self._binaryop(
-            other=other, op="__lt__", fill_value=fill_value, can_reindex=True
-        )
-
-    @_cudf_nvtx_annotate
-    def le(self, other, axis="columns", level=None, fill_value=None):
-        """Less than or equal, element-wise (binary operator le).
-
-        Parameters
-        ----------
-        other : Series or scalar value
-        fill_value : None or value
-            Value to fill nulls with before computation. If data in both
-            corresponding Series locations is null the result will be null
-
-        Returns
-        -------
-        Frame
-            The result of the operation.
-
-        Examples
-        --------
-        **DataFrame**
-
-        >>> left = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'c': [7, 8, 9]}
-        ... )
-        >>> right = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'd': [10, 12, 12]}
-        ... )
-        >>> left.le(right)
-              a     b     c     d
-        0  True  True  <NA>  <NA>
-        1  True  True  <NA>  <NA>
-        2  True  True  <NA>  <NA>
-        >>> left.le(right, fill_value=7)
-              a     b      c     d
-        0  True  True   True  True
-        1  True  True  False  True
-        2  True  True  False  True
-
-        **Series**
-
-        >>> a = cudf.Series([1, 2, 3, None, 10, 20],
-        ...                 index=['a', 'c', 'd', 'e', 'f', 'g'])
-        >>> a
-        a       1
-        c       2
-        d       3
-        e    <NA>
-        f      10
-        g      20
-        dtype: int64
-        >>> b = cudf.Series([-10, 23, -1, None, None],
-        ...                 index=['a', 'b', 'c', 'd', 'e'])
-        >>> b
-        a     -10
-        b      23
-        c      -1
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        >>> a.le(b, fill_value=-10)
-        a    False
-        b     True
-        c    False
-        d    False
-        e     <NA>
-        f    False
-        g    False
-        dtype: bool
-        """  # noqa: E501
-        return self._binaryop(
-            other=other, op="__le__", fill_value=fill_value, can_reindex=True
-        )
-
-    @_cudf_nvtx_annotate
-    def gt(self, other, axis="columns", level=None, fill_value=None):
-        """Greater than, element-wise (binary operator gt).
-
-        Parameters
-        ----------
-        other : Series or scalar value
-        fill_value : None or value
-            Value to fill nulls with before computation. If data in both
-            corresponding Series locations is null the result will be null
-
-        Returns
-        -------
-        Frame
-            The result of the operation.
-
-        Examples
-        --------
-        **DataFrame**
-
-        >>> left = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'c': [7, 8, 9]}
-        ... )
-        >>> right = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'd': [10, 12, 12]}
-        ... )
-        >>> left.gt(right)
-               a      b     c     d
-        0  False  False  <NA>  <NA>
-        1  False  False  <NA>  <NA>
-        2  False  False  <NA>  <NA>
-        >>> left.gt(right, fill_value=7)
-               a      b      c      d
-        0  False  False  False  False
-        1  False  False   True  False
-        2  False  False   True  False
-
-        **Series**
-
-        >>> a = cudf.Series([1, 2, 3, None, 10, 20],
-        ...                 index=['a', 'c', 'd', 'e', 'f', 'g'])
-        >>> a
-        a       1
-        c       2
-        d       3
-        e    <NA>
-        f      10
-        g      20
-        dtype: int64
-        >>> b = cudf.Series([-10, 23, -1, None, None],
-        ...                 index=['a', 'b', 'c', 'd', 'e'])
-        >>> b
-        a     -10
-        b      23
-        c      -1
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        >>> a.gt(b)
-        a     True
-        b    False
-        c     True
-        d    False
-        e    False
-        f    False
-        g    False
-        dtype: bool
-        """  # noqa: E501
-        return self._binaryop(
-            other=other, op="__gt__", fill_value=fill_value, can_reindex=True
-        )
-
-    @_cudf_nvtx_annotate
-    def ge(self, other, axis="columns", level=None, fill_value=None):
-        """Greater than or equal, element-wise (binary operator ge).
-
-        Parameters
-        ----------
-        other : Series or scalar value
-        fill_value : None or value
-            Value to fill nulls with before computation. If data in both
-            corresponding Series locations is null the result will be null
-
-        Returns
-        -------
-        Frame
-            The result of the operation.
-
-        Examples
-        --------
-        **DataFrame**
-
-        >>> left = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'c': [7, 8, 9]}
-        ... )
-        >>> right = cudf.DataFrame({
-        ...     'a': [1, 2, 3],
-        ...     'b': [4, 5, 6],
-        ...     'd': [10, 12, 12]}
-        ... )
-        >>> left.ge(right)
-              a     b     c     d
-        0  True  True  <NA>  <NA>
-        1  True  True  <NA>  <NA>
-        2  True  True  <NA>  <NA>
-        >>> left.ge(right, fill_value=7)
-              a     b     c      d
-        0  True  True  True  False
-        1  True  True  True  False
-        2  True  True  True  False
-
-        **Series**
-
-        >>> a = cudf.Series([1, 2, 3, None, 10, 20],
-        ...                 index=['a', 'c', 'd', 'e', 'f', 'g'])
-        >>> a
-        a       1
-        c       2
-        d       3
-        e    <NA>
-        f      10
-        g      20
-        dtype: int64
-        >>> b = cudf.Series([-10, 23, -1, None, None],
-        ...                 index=['a', 'b', 'c', 'd', 'e'])
-        >>> b
-        a     -10
-        b      23
-        c      -1
-        d    <NA>
-        e    <NA>
-        dtype: int64
-        >>> a.ge(b)
-        a     True
-        b    False
-        c     True
-        d    False
-        e    False
-        f    False
-        g    False
-        dtype: bool
-        """  # noqa: E501
-        return self._binaryop(
-            other=other, op="__ge__", fill_value=fill_value, can_reindex=True
-        )
-
     def nunique(self, dropna: bool = True):
         """
         Returns a per column mapping with counts of unique values for
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 3d025738974..10736948b57 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -5,6 +5,7 @@
 
 import numbers
 import operator
+import textwrap
 import warnings
 from collections import Counter, abc
 from functools import cached_property
@@ -44,6 +45,7 @@
 from cudf.core.index import Index, RangeIndex, _index_from_columns
 from cudf.core.multiindex import MultiIndex
 from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame
+from cudf.utils import docutils
 from cudf.utils.utils import _cudf_nvtx_annotate
 
 doc_reset_index_template = """
@@ -72,6 +74,55 @@
 """
 
 
+doc_binop_template = textwrap.dedent(
+    """
+    Get {operation} of DataFrame or Series and other, element-wise (binary
+    operator `{op_name}`).
+
+    Equivalent to ``frame + other``, but with support to substitute a
+    ``fill_value`` for missing data in one of the inputs.
+
+    Parameters
+    ----------
+    other : scalar, sequence, Series, or DataFrame
+        Any single or multiple element data structure, or list-like object.
+    axis : int or string
+        Only ``0`` is supported for series, ``1`` or ``columns`` supported
+        for dataframe
+    level : int or name
+        Broadcast across a level, matching Index values on the
+        passed MultiIndex level. Not yet supported.
+    fill_value  : float or None, default None
+        Fill existing missing (NaN) values, and any new element needed
+        for successful DataFrame alignment, with this value before
+        computation. If data in both corresponding DataFrame locations
+        is missing the result will be missing.
+
+    Returns
+    -------
+    DataFrame or Series
+        Result of the arithmetic operation.
+
+    Examples
+    --------
+
+    **DataFrame**
+
+    >>> df = cudf.DataFrame(
+    ...     {{'angles': [0, 3, 4], 'degrees': [360, 180, 360]}},
+    ...     index=['circle', 'triangle', 'rectangle']
+    ... )
+    {df_op_example}
+
+    **Series**
+
+    >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd'])
+    >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e'])
+    {ser_op_example}
+    """
+)
+
+
 def _get_host_unique(array):
     if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)):
         return array.unique.to_pandas()
@@ -2653,6 +2704,845 @@ def _explode(self, explode_column: Any, ignore_index: bool):
             res.index.names = self._index.names
         return res
 
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Addition",
+            op_name="add",
+            equivalent_op="frame + other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.add(1)
+                        angles  degrees
+                circle          1      361
+                triangle        4      181
+                rectangle       5      361
+                """,
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.add(b)
+                a       2
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.add(b, fill_value=0)
+                a       2
+                b       1
+                c       1
+                d       1
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def add(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__add__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Addition",
+            op_name="radd",
+            equivalent_op="other + frame",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.radd(1)
+                        angles  degrees
+                circle          1      361
+                triangle        4      181
+                rectangle       5      361
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.radd(b)
+                a       2
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.radd(b, fill_value=0)
+                a       2
+                b       1
+                c       1
+                d       1
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def radd(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__radd__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Subtraction",
+            op_name="sub",
+            equivalent_op="frame - other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.sub(1)
+                        angles  degrees
+                circle         -1      359
+                triangle        2      179
+                rectangle       3      359
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.sub(b)
+                a       0
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.sub(b, fill_value=0)
+                a       2
+                b       1
+                c       1
+                d      -1
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def subtract(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__sub__", fill_value)
+
+    sub = subtract
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Subtraction",
+            op_name="rsub",
+            equivalent_op="other - frame",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.rsub(1)
+                        angles  degrees
+                circle          1     -359
+                triangle       -2     -179
+                rectangle      -3     -359
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.rsub(b)
+                a       0
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.rsub(b, fill_value=0)
+                a       0
+                b      -1
+                c      -1
+                d       1
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def rsub(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__rsub__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Multiplication",
+            op_name="mul",
+            equivalent_op="frame * other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.multiply(1)
+                        angles  degrees
+                circle          0      360
+                triangle        3      180
+                rectangle       4      360
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.multiply(b)
+                a       1
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.multiply(b, fill_value=0)
+                a       1
+                b       0
+                c       0
+                d       0
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def multiply(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__mul__", fill_value)
+
+    mul = multiply
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Multiplication",
+            op_name="rmul",
+            equivalent_op="other * frame",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.rmul(1)
+                        angles  degrees
+                circle          0      360
+                triangle        3      180
+                rectangle       4      360
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.rmul(b)
+                a       1
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.rmul(b, fill_value=0)
+                a       1
+                b       0
+                c       0
+                d       0
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def rmul(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__rmul__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Modulo",
+            op_name="mod",
+            equivalent_op="frame % other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.mod(1)
+                        angles  degrees
+                circle          0        0
+                triangle        0        0
+                rectangle       0        0
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.mod(b)
+                a       0
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.mod(b, fill_value=0)
+                a             0
+                b    4294967295
+                c    4294967295
+                d             0
+                e          <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def mod(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__mod__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Modulo",
+            op_name="rmod",
+            equivalent_op="other % frame",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.rmod(1)
+                            angles  degrees
+                circle     4294967295        1
+                triangle            1        1
+                rectangle           1        1
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.rmod(b)
+                a       0
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.rmod(b, fill_value=0)
+                a             0
+                b             0
+                c             0
+                d    4294967295
+                e          <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def rmod(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__rmod__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Exponential",
+            op_name="pow",
+            equivalent_op="frame ** other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.pow(1)
+                        angles  degrees
+                circle          0      360
+                triangle        2      180
+                rectangle       4      360
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.pow(b)
+                a       1
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.pow(b, fill_value=0)
+                a       1
+                b       1
+                c       1
+                d       0
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def pow(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__pow__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Exponential",
+            op_name="rpow",
+            equivalent_op="other ** frame",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.rpow(1)
+                        angles  degrees
+                circle          1        1
+                triangle        1        1
+                rectangle       1        1
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.rpow(b)
+                a       1
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.rpow(b, fill_value=0)
+                a       1
+                b       0
+                c       0
+                d       1
+                e    <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def rpow(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__rpow__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Integer division",
+            op_name="floordiv",
+            equivalent_op="frame // other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.floordiv(1)
+                        angles  degrees
+                circle          0      360
+                triangle        3      180
+                rectangle       4      360
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.floordiv(b)
+                a       1
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.floordiv(b, fill_value=0)
+                a                      1
+                b    9223372036854775807
+                c    9223372036854775807
+                d                      0
+                e                   <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def floordiv(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__floordiv__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Integer division",
+            op_name="rfloordiv",
+            equivalent_op="other // frame",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.rfloordiv(1)
+                                        angles  degrees
+                circle     9223372036854775807        0
+                triangle                     0        0
+                rectangle                    0        0
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.rfloordiv(b)
+                a       1
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: int64
+                >>> a.rfloordiv(b, fill_value=0)
+                a                      1
+                b                      0
+                c                      0
+                d    9223372036854775807
+                e                   <NA>
+                dtype: int64
+                """
+            ),
+        )
+    )
+    def rfloordiv(
+        self, other, axis, level=None, fill_value=None
+    ):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__rfloordiv__", fill_value)
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Floating division",
+            op_name="truediv",
+            equivalent_op="frame / other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.truediv(1)
+                        angles  degrees
+                circle        0.0    360.0
+                triangle      3.0    180.0
+                rectangle     4.0    360.0
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.truediv(b)
+                a     1.0
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: float64
+                >>> a.truediv(b, fill_value=0)
+                a     1.0
+                b     Inf
+                c     Inf
+                d     0.0
+                e    <NA>
+                dtype: float64
+                """
+            ),
+        )
+    )
+    def truediv(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__truediv__", fill_value)
+
+    # Alias for truediv
+    div = truediv
+    divide = truediv
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Floating division",
+            op_name="rtruediv",
+            equivalent_op="other / frame",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.rtruediv(1)
+                            angles   degrees
+                circle          inf  0.002778
+                triangle   0.333333  0.005556
+                rectangle  0.250000  0.002778
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.rtruediv(b)
+                a     1.0
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: float64
+                >>> a.rtruediv(b, fill_value=0)
+                a     1.0
+                b     0.0
+                c     0.0
+                d     Inf
+                e    <NA>
+                dtype: float64
+                """
+            ),
+        )
+    )
+    def rtruediv(self, other, axis, level=None, fill_value=None):  # noqa: D102
+        if level is not None:
+            raise NotImplementedError("level parameter is not supported yet.")
+
+        return self._binaryop(other, "__rtruediv__", fill_value)
+
+    # Alias for rtruediv
+    rdiv = rtruediv
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Equal to",
+            op_name="eq",
+            equivalent_op="frame == other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.eq(1)
+                        angles  degrees
+                circle      False    False
+                triangle    False    False
+                rectangle   False    False
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.eq(b)
+                a    True
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: bool
+                >>> a.eq(b, fill_value=0)
+                a    True
+                b   False
+                c   False
+                d   False
+                e    <NA>
+                dtype: bool
+                """
+            ),
+        )
+    )
+    def eq(
+        self, other, axis="columns", level=None, fill_value=None
+    ):  # noqa: D102
+        return self._binaryop(
+            other=other, op="__eq__", fill_value=fill_value, can_reindex=True
+        )
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Not equal to",
+            op_name="ne",
+            equivalent_op="frame != other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.ne(1)
+                        angles  degrees
+                circle       True     True
+                triangle     True     True
+                rectangle    True     True
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.ne(b)
+                a    False
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: bool
+                >>> a.ne(b, fill_value=0)
+                a   False
+                b    True
+                c    True
+                d    True
+                e    <NA>
+                dtype: bool
+                """
+            ),
+        )
+    )
+    def ne(
+        self, other, axis="columns", level=None, fill_value=None
+    ):  # noqa: D102
+        return self._binaryop(
+            other=other, op="__ne__", fill_value=fill_value, can_reindex=True
+        )
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Less than",
+            op_name="lt",
+            equivalent_op="frame < other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.lt(1)
+                        angles  degrees
+                circle       True    False
+                triangle    False    False
+                rectangle   False    False
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.lt(b)
+                a   False
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: bool
+                >>> a.lt(b, fill_value=0)
+                a   False
+                b   False
+                c   False
+                d    True
+                e    <NA>
+                dtype: bool
+                """
+            ),
+        )
+    )
+    def lt(
+        self, other, axis="columns", level=None, fill_value=None
+    ):  # noqa: D102
+        return self._binaryop(
+            other=other, op="__lt__", fill_value=fill_value, can_reindex=True
+        )
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Less than or equal to",
+            op_name="le",
+            equivalent_op="frame <= other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.le(1)
+                        angles  degrees
+                circle       True    False
+                triangle    False    False
+                rectangle   False    False
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.le(b)
+                a    True
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: bool
+                >>> a.le(b, fill_value=0)
+                a    True
+                b   False
+                c   False
+                d    True
+                e    <NA>
+                dtype: bool
+                """
+            ),
+        )
+    )
+    def le(
+        self, other, axis="columns", level=None, fill_value=None
+    ):  # noqa: D102
+        return self._binaryop(
+            other=other, op="__le__", fill_value=fill_value, can_reindex=True
+        )
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Greater than",
+            op_name="gt",
+            equivalent_op="frame > other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.gt(1)
+                        angles  degrees
+                circle      False     True
+                triangle     True     True
+                rectangle    True     True
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.gt(b)
+                a   False
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: bool
+                >>> a.gt(b, fill_value=0)
+                a   False
+                b    True
+                c    True
+                d   False
+                e    <NA>
+                dtype: bool
+                """
+            ),
+        )
+    )
+    def gt(
+        self, other, axis="columns", level=None, fill_value=None
+    ):  # noqa: D102
+        return self._binaryop(
+            other=other, op="__gt__", fill_value=fill_value, can_reindex=True
+        )
+
+    @_cudf_nvtx_annotate
+    @docutils.doc_apply(
+        doc_binop_template.format(
+            operation="Greater than or equal to",
+            op_name="ge",
+            equivalent_op="frame >= other",
+            df_op_example=textwrap.dedent(
+                """
+                >>> df.ge(1)
+                        angles  degrees
+                circle      False     True
+                triangle     True     True
+                rectangle    True     True
+                """
+            ),
+            ser_op_example=textwrap.dedent(
+                """
+                >>> a.ge(b)
+                a    True
+                b    <NA>
+                c    <NA>
+                d    <NA>
+                e    <NA>
+                dtype: bool
+                >>> a.ge(b, fill_value=0)
+                a   True
+                b    True
+                c    True
+                d   False
+                e    <NA>
+                dtype: bool
+                """
+            ),
+        )
+    )
+    def ge(
+        self, other, axis="columns", level=None, fill_value=None
+    ):  # noqa: D102
+        return self._binaryop(
+            other=other, op="__ge__", fill_value=fill_value, can_reindex=True
+        )
+
 
 def _check_duplicate_level_names(specified, level_names):
     """Raise if any of `specified` has duplicates in `level_names`."""
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index d14942cd3ce..965810a19e6 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -53,7 +53,6 @@
 from cudf.core.column.string import StringMethods
 from cudf.core.column.struct import StructMethods
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.core.frame import Frame
 from cudf.core.groupby.groupby import SeriesGroupBy
 from cudf.core.index import BaseIndex, RangeIndex, as_index
 from cudf.core.indexed_frame import (
@@ -3284,7 +3283,7 @@ def make_binop_func(op):
     # appropriate API for Series as required for pandas compatibility. The
     # main effect is reordering and error-checking parameters in
     # Series-specific ways.
-    wrapped_func = getattr(Frame, op)
+    wrapped_func = getattr(IndexedFrame, op)
 
     @functools.wraps(wrapped_func)
     def wrapper(self, other, level=None, fill_value=None, axis=0):
diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py
index 5cda9fb8218..1d0bd5ac99d 100644
--- a/python/cudf/cudf/core/udf/row_function.py
+++ b/python/cudf/cudf/core/udf/row_function.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 import math
 
 import numpy as np
diff --git a/python/cudf/cudf/core/udf/scalar_function.py b/python/cudf/cudf/core/udf/scalar_function.py
index 7f3b461a1f0..a7b887dd2d5 100644
--- a/python/cudf/cudf/core/udf/scalar_function.py
+++ b/python/cudf/cudf/core/udf/scalar_function.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 from numba import cuda
 from numba.np import numpy_support
 
diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py
index 8cb11133323..3ac7083582f 100644
--- a/python/cudf/cudf/core/udf/templates.py
+++ b/python/cudf/cudf/core/udf/templates.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 unmasked_input_initializer_template = """\
         d_{idx} = input_col_{idx}
         masked_{idx} = Masked(d_{idx}[i], True)
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 2be1691a1a6..cdb9492c695 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import operator
 
diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py
index d7a2fedef59..c6091ab60fc 100644
--- a/python/cudf/cudf/datasets.py
+++ b/python/cudf/cudf/datasets.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 import numpy as np
 import pandas as pd
 
diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py
index 5d6f52c0307..bd264940081 100644
--- a/python/cudf/cudf/errors.py
+++ b/python/cudf/cudf/errors.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 
 class UnsupportedCUDAError(Exception):
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 0d0ba579f22..1d3d9e91ae2 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 import numpy as np
 import pandas as pd
 import pytest
diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py
index 3081b7c4a6e..90cf11d7dde 100644
--- a/python/cudf/cudf/tests/test_factorize.py
+++ b/python/cudf/cudf/tests/test_factorize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
 import cupy as cp
 import numpy as np
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index 307232b1305..f15d705c4e2 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import io
 import os
diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py
index 2d61d6693cb..de4303a34a8 100644
--- a/python/cudf/cudf/tests/test_hdfs.py
+++ b/python/cudf/cudf/tests/test_hdfs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import os
 from io import BytesIO
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 6a665a2b43c..dc624ebe2b5 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -292,10 +292,32 @@ def test_get_nested_lists():
     assert_eq(expect, got)
 
 
-def test_get_nulls():
-    with pytest.raises(IndexError, match="list index out of range"):
-        sr = cudf.Series([[], [], []])
-        sr.list.get(100)
+def test_get_default():
+    sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]])
+
+    assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2))
+    assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA))
+    assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0))
+    assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0))
+    assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1))
+
+    string_sr = cudf.Series(
+        [["apple", "banana"], ["carrot", "daffodil", "elephant"]]
+    )
+    assert_eq(
+        cudf.Series(["default", "elephant"]),
+        string_sr.list.get(2, default="default"),
+    )
+
+    sr_with_null = cudf.Series([[0, cudf.NA], [1]])
+    assert_eq(cudf.Series([cudf.NA, 0]), sr_with_null.list.get(1, default=0))
+
+    sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]])
+    assert_eq(cudf.Series([[3, 4], [7, 8]]), sr_nested.list.get(1))
+    assert_eq(cudf.Series([[5, 6], cudf.NA]), sr_nested.list.get(2))
+    assert_eq(
+        cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0])
+    )
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py
index 942cba2842a..67435c71e6b 100644
--- a/python/cudf/cudf/tests/test_pack.py
+++ b/python/cudf/cudf/tests/test_pack.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 10c3689fcd7..a182a5e7d24 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
 import string
 from itertools import product
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 493098cd494..d600fdeee27 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1299,6 +1299,12 @@ def test_string_slice_replace(string, number, diff, repr):
     )
 
 
+def test_string_slice_replace_fail():
+    gs = cudf.Series(["abc", "xyz", ""])
+    with pytest.raises(TypeError):
+        gs.str.slice_replace(0, 1, ["_"])
+
+
 def test_string_insert():
     gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""])
 
@@ -1312,6 +1318,9 @@ def test_string_insert():
         ps.str.slice(stop=5) + "---" + ps.str.slice(start=5),
     )
 
+    with pytest.raises(TypeError):
+        gs.str.insert(0, ["+"])
+
 
 _string_char_types_data = [
     ["abc", "xyz", "a", "ab", "123", "097"],
@@ -1404,6 +1413,9 @@ def test_string_filter_alphanum():
         expected.append(rs)
     assert_eq(gs.str.filter_alphanum("*", keep=False), cudf.Series(expected))
 
+    with pytest.raises(TypeError):
+        gs.str.filter_alphanum(["a"])
+
 
 @pytest.mark.parametrize(
     "case_op", ["title", "capitalize", "lower", "upper", "swapcase"]
@@ -1504,6 +1516,14 @@ def test_strings_partition(data):
     assert_eq(pi.str.partition("-"), gi.str.partition("-"))
 
 
+def test_string_partition_fail():
+    gs = cudf.Series(["abc", "aa", "cba"])
+    with pytest.raises(TypeError):
+        gs.str.partition(["a"])
+    with pytest.raises(TypeError):
+        gs.str.rpartition(["a"])
+
+
 @pytest.mark.parametrize(
     "data",
     [
@@ -1640,6 +1660,16 @@ def test_strings_strip_tests(data, to_strip):
     )
 
 
+def test_string_strip_fail():
+    gs = cudf.Series(["a", "aa", ""])
+    with pytest.raises(TypeError):
+        gs.str.strip(["a"])
+    with pytest.raises(TypeError):
+        gs.str.lstrip(["a"])
+    with pytest.raises(TypeError):
+        gs.str.rstrip(["a"])
+
+
 @pytest.mark.parametrize(
     "data",
     [
@@ -2364,6 +2394,9 @@ def test_string_str_filter_characters():
     )
     assert_eq(expected, gs.str.filter_characters(filter, True, " "))
 
+    with pytest.raises(TypeError):
+        gs.str.filter_characters(filter, True, ["a"])
+
 
 def test_string_str_code_points():
 
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index 167f171fa26..8d1056ca9cc 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index 742c747ab69..96b124c27ec 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+
 from pickle import dumps
 
 import cachetools
diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py
index bd3da4ea2ba..a722d350ef4 100644
--- a/python/cudf/cudf/utils/gpu_utils.py
+++ b/python/cudf/cudf/utils/gpu_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 
 def validate_setup():
diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py
index 64218ddf46a..cdaaff6b2af 100644
--- a/python/cudf/cudf/utils/queryutils.py
+++ b/python/cudf/cudf/utils/queryutils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
 import ast
 import datetime as dt
diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py
index 6cd10cc10bf..3c1d113fd47 100644
--- a/python/cudf_kafka/cudf_kafka/_version.py
+++ b/python/cudf_kafka/cudf_kafka/_version.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 # This file helps to compute a version number in source trees obtained from
 # git-archive tarball (such as those provided by githubs download-from-tag
 # feature). Distribution tarballs (built by setup.py sdist) and build
diff --git a/python/cudf_kafka/versioneer.py b/python/cudf_kafka/versioneer.py
index a3b0246e785..dbddb6e0fd0 100644
--- a/python/cudf_kafka/versioneer.py
+++ b/python/cudf_kafka/versioneer.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 # Version: 0.18
 
 """The Versioneer - like a rocketeer, but for versions.
diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py
index 106fc3524f9..a017486df32 100644
--- a/python/custreamz/custreamz/_version.py
+++ b/python/custreamz/custreamz/_version.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 # This file helps to compute a version number in source trees obtained from
 # git-archive tarball (such as those provided by githubs download-from-tag
 # feature). Distribution tarballs (built by setup.py sdist) and build
diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py
index a7378408c24..51f55684500 100644
--- a/python/custreamz/custreamz/tests/test_dataframes.py
+++ b/python/custreamz/custreamz/tests/test_dataframes.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 """
 Tests for Streamz Dataframes (SDFs) built on top of cuDF DataFrames.
diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py
index 104879fce36..f0dbcac0017 100644
--- a/python/dask_cudf/dask_cudf/_version.py
+++ b/python/dask_cudf/dask_cudf/_version.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 # This file helps to compute a version number in source trees obtained from
 # git-archive tarball (such as those provided by githubs download-from-tag
 # feature). Distribution tarballs (built by setup.py sdist) and build
diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index 2d326e41c3e..3c11fe3ffbb 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from io import BufferedWriter, IOBase
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 84c0e0e9b39..95cf0c8d56d 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -384,7 +384,7 @@ def test_contains(data, search_key):
     "data, index, expectation",
     [
         (data_test_1(), 1, does_not_raise()),
-        (data_test_2(), 2, pytest.raises(IndexError)),
+        (data_test_2(), 2, does_not_raise()),
     ],
 )
 def test_get(data, index, expectation):

0.0	4.704433	3.380014	0.000000	11.030136	0.0	0.000000	0.0	0.0	0.000000	0.0	-1.162275	0.000000	0.000000	0.000000	0.000000	0.00000	5.726806	0.0	0.0	0.000000
1	0.0	0.000000	0.000000	0.0	0.000000	0.0	0.0	11.460403	0.0	0.000000	0.000000	0.000000	0.000000	5.917846	0.000000	5.90886	0.000000	0.0	0.0	0.000000
2	0.0	0.000000	0.000000	0.0	0.000000	0.0	0.0	0.000000	0.0	0.000000	0.407392	0.000000	0.000000	0.000000	6.646564	0.00000	0.000000	0.0	0.0	3.399164
3	0.0	0.000000	0.000000	0.0	0.000000	0.0	0.0	14.092100	0.000000	0.0	0.378781	0.420953	0.000000	0.000000	0.00000	0.000000	0.0	0.0	0.000000
4	0.0	0.000000	8.299425	0.0	2.096401	0.0	0.109242	2.541798	0.0	0.071563	8.223387	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	10.744624	0.0	0.0	0.000000