diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp
index 67ea6129a74..21e5db8ca8f 100644
--- a/cpp/benchmarks/column/concatenate.cpp
+++ b/cpp/benchmarks/column/concatenate.cpp
@@ -45,7 +45,7 @@ static void BM_concatenate(benchmark::State& state)
   auto input_columns = input->view();
   std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -87,7 +87,7 @@ static void BM_concatenate_tables(benchmark::State& state)
     return table->view();
   });
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
@@ -146,7 +146,7 @@ static void BM_concatenate_strings(benchmark::State& state)
     return static_cast<cudf::column_view>(col);
   });
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true, rmm::cuda_stream_default);
diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index 5df77ac4319..e846317f472 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -139,18 +139,18 @@ void generate_input_tables(key_type* const build_tbl,
   // Maximize exposed parallelism while minimizing storage for curand state
   int num_blocks_init_build_tbl{-1};
 
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_build_tbl, init_build_tbl<key_type, size_type>, block_size, 0));
 
   int num_blocks_init_probe_tbl{-1};
 
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));
 
   int dev_id{-1};
-  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
+  CUDA_TRY(cudaGetDevice(&dev_id));
 
   int num_sms{-1};
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
 
   const int num_states =
     num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
@@ -158,12 +158,12 @@ void generate_input_tables(key_type* const build_tbl,
 
   init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   init_build_tbl<key_type, size_type><<<num_sms * num_blocks_init_build_tbl, block_size>>>(
     build_tbl, build_tbl_size, multiplicity, devStates.data(), num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   auto const rand_max = std::numeric_limits<key_type>::max();
@@ -177,5 +177,5 @@ void generate_input_tables(key_type* const build_tbl,
     devStates.data(),
     num_states);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 }
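Note for reviewers: `generate_input_tables` sizes its grids from the occupancy API rather than from the input size. A minimal standalone sketch of that pattern follows; `fill_kernel` and `saturating_grid_size` are hypothetical names, not part of this diff.

```cpp
#include <algorithm>
#include <cuda_runtime.h>

__global__ void fill_kernel(int* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = i;
}

// Pick a grid size that saturates the device for fill_kernel: resident
// blocks per SM (at this block size) times the number of SMs.
int saturating_grid_size(int block_size)
{
  int blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, fill_kernel, block_size, 0);

  int dev = 0, num_sms = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev);

  return std::max(1, blocks_per_sm * num_sms);
}
```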
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index 6ff2543cf7d..27339248968 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -116,7 +116,7 @@ static void BM_join(state_type& state, Join JoinFunc)
   auto build_payload_column = cudf::sequence(build_table_size, *init);
   auto probe_payload_column = cudf::sequence(probe_table_size, *init);
 
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   cudf::table_view build_table({build_key_column->view(), *build_payload_column});
   cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column});
diff --git a/cpp/benchmarks/synchronization/synchronization.cpp b/cpp/benchmarks/synchronization/synchronization.cpp
index bbf90e6f68a..bd8a4d1de76 100644
--- a/cpp/benchmarks/synchronization/synchronization.cpp
+++ b/cpp/benchmarks/synchronization/synchronization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,32 +29,32 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state,
   // flush all of L2$
   if (flush_l2_cache) {
     int current_device = 0;
-    CUDF_CUDA_TRY(cudaGetDevice(&current_device));
+    CUDA_TRY(cudaGetDevice(&current_device));
 
     int l2_cache_bytes = 0;
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
+    CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device));
 
     if (l2_cache_bytes > 0) {
       const int memset_value = 0;
       rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream);
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value()));
     }
   }
 
-  CUDF_CUDA_TRY(cudaEventCreate(&start));
-  CUDF_CUDA_TRY(cudaEventCreate(&stop));
-  CUDF_CUDA_TRY(cudaEventRecord(start, stream.value()));
+  CUDA_TRY(cudaEventCreate(&start));
+  CUDA_TRY(cudaEventCreate(&stop));
+  CUDA_TRY(cudaEventRecord(start, stream.value()));
 }
 
 cuda_event_timer::~cuda_event_timer()
 {
-  CUDF_CUDA_TRY(cudaEventRecord(stop, stream.value()));
-  CUDF_CUDA_TRY(cudaEventSynchronize(stop));
+  CUDA_TRY(cudaEventRecord(stop, stream.value()));
+  CUDA_TRY(cudaEventSynchronize(stop));
 
   float milliseconds = 0.0f;
-  CUDF_CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
+  CUDA_TRY(cudaEventElapsedTime(&milliseconds, start, stop));
   p_state->SetIterationTime(milliseconds / (1000.0f));
-  CUDF_CUDA_TRY(cudaEventDestroy(start));
-  CUDF_CUDA_TRY(cudaEventDestroy(stop));
+  CUDA_TRY(cudaEventDestroy(start));
+  CUDA_TRY(cudaEventDestroy(stop));
 }
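For context, `cuda_event_timer` above is the standard CUDA event-timing idiom. A minimal sketch of the same pattern outside the RAII wrapper (the function name is hypothetical):

```cpp
#include <cuda_runtime.h>

// Record start, enqueue work, record stop, then synchronize only on the
// stop event; cudaEventElapsedTime reports milliseconds between the events.
float time_stream_work(cudaStream_t stream)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, stream);
  // ... enqueue kernels / async copies on `stream` here ...
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);  // blocks until all work before `stop` finishes

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}
```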
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index aba78dad3fe..6ab6f9a2095 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -196,13 +196,13 @@ void type_dispatcher_benchmark(::benchmark::State& state)
   rmm::device_uvector<TypeParam*> d_vec(n_cols, rmm::cuda_stream_default);
 
   if (dispatching_type == NO_DISPATCHING) {
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice));
   }
 
   // Warm up
   launch_kernel(source_table, d_vec.data(), work_per_thread);
-  CUDF_CUDA_TRY(cudaDeviceSynchronize());
+  CUDA_TRY(cudaDeviceSynchronize());
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md
index 1599c81cbe5..eeebe38d873 100644
--- a/cpp/docs/DEVELOPER_GUIDE.md
+++ b/cpp/docs/DEVELOPER_GUIDE.md
@@ -373,7 +373,7 @@ namespace detail{
 void external_function(..., rmm::cuda_stream_view stream){
   // Implementation uses the stream with async APIs.
   rmm::device_buffer buff(..., stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(..., stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(..., stream.value()));
   kernel<<<..., stream>>>(...);
   thrust::algorithm(rmm::exec_policy(stream), ...);
 }
@@ -777,7 +777,7 @@ CUDF_FAIL("This code path should not be reached.");
 
 ### CUDA Error Checking
 
-Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
+Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
 macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The
 thrown exception includes a description of the CUDA error code in its `what()` message.
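A minimal sketch of the usage the developer guide describes, under the post-revert macro name (`copy_to_device` is a hypothetical helper):

```cpp
#include <cudf/utilities/error.hpp>
#include <rmm/cuda_stream_view.hpp>

// CUDA_TRY wraps a runtime API call and throws cudf::cuda_error on failure,
// so callers rely on ordinary C++ exception handling instead of checking
// cudaError_t return codes by hand.
void copy_to_device(void* dst, void const* src, std::size_t bytes, rmm::cuda_stream_view stream)
{
  CUDA_TRY(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream.value()));
}
```

A caller can then `catch (cudf::cuda_error const& e)` and inspect `e.what()` for the decoded CUDA error string.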
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index ecaa4a30cf0..0087dd1b173 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -254,10 +254,10 @@ struct scatter_gather_functor {
     if (output.nullable()) {
       // Have to initialize the output mask to all zeros because we may update
       // it with atomicOr().
-      CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
-                                    0,
-                                    cudf::bitmask_allocation_size_bytes(output.size()),
-                                    stream.value()));
+      CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
+                               0,
+                               cudf::bitmask_allocation_size_bytes(output.size()),
+                               stream.value()));
     }
 
     auto output_device_view = cudf::mutable_column_device_view::create(output, stream);
@@ -344,7 +344,7 @@ std::unique_ptr<table> copy_if(
 
   // initialize just the first element of block_offsets to 0 since the InclusiveSum below
   // starts at the second element.
-  CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
 
   // 2. Find the offset for each block's output using a scan of block counts
   if (grid.num_blocks > 1) {
@@ -370,7 +370,7 @@ std::unique_ptr<table> copy_if(
   // As it is InclusiveSum, last value in block_offsets will be output_size
   // unless num_blocks == 1, in which case output_size is just block_counts[0]
   cudf::size_type output_size{0};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     &output_size,
     grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(),
     sizeof(cudf::size_type),
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 6703db305a1..ac59b429a2c 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin,
       nullptr);
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 }
 
 /**
diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh
index 49a406ab5f0..56c0289dc0a 100644
--- a/cpp/include/cudf/detail/get_value.cuh
+++ b/cpp/include/cudf/detail/get_value.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,11 +49,11 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre
   CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(),
                "invalid element_index value");
   T result;
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&result,
-                                col_view.data<T>() + element_index,
-                                sizeof(T),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(&result,
+                           col_view.data<T>() + element_index,
+                           sizeof(T),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   stream.synchronize();
   return result;
 }
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index 7aec56fdc51..be010689847 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -170,20 +170,20 @@ size_type inplace_bitmask_binop(
   rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
   rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyHostToDevice, stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
-                                masks_begin_bits.data(),
-                                masks_begin_bits.size_bytes(),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
+                           masks_begin_bits.data(),
+                           masks_begin_bits.size_bytes(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   auto constexpr block_size = 256;
   cudf::detail::grid_1d config(dest_mask.size(), block_size);
   offset_bitmask_binop<block_size>
     <<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       op, dest_mask, d_masks, d_begin_bits, mask_size_bits, d_counter.data());
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return d_counter.value(stream);
 }
@@ -298,25 +298,27 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
   // Allocate temporary memory.
   size_t temp_storage_bytes{0};
-  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
-                                                temp_storage_bytes,
-                                                num_set_bits_in_word,
-                                                d_bit_counts.begin(),
-                                                num_ranges,
-                                                first_word_indices,
-                                                last_word_indices,
-                                                stream.value()));
+  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr,
+                                           temp_storage_bytes,
+                                           num_set_bits_in_word,
+                                           d_bit_counts.begin(),
+                                           num_ranges,
+                                           first_word_indices,
+                                           last_word_indices,
+                                           stream.value()));
   rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
 
   // Perform segmented reduction.
-  CUDF_CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
-                                                temp_storage_bytes,
-                                                num_set_bits_in_word,
-                                                d_bit_counts.begin(),
-                                                num_ranges,
-                                                first_word_indices,
-                                                last_word_indices,
-                                                stream.value()));
+  CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(),
+                                           temp_storage_bytes,
+                                           num_set_bits_in_word,
+                                           d_bit_counts.begin(),
+                                           num_ranges,
+                                           first_word_indices,
+                                           last_word_indices,
+                                           stream.value()));
+
+  CHECK_CUDA(stream.value());
 
   // Adjust counts in segment boundaries (if segments are not word-aligned).
   constexpr size_type block_size{256};
@@ -348,7 +350,7 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
     });
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return d_bit_counts;
 }
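The two back-to-back `cub::DeviceSegmentedReduce::Sum` calls above are CUB's standard two-phase idiom. A minimal sketch of the same pattern with a plain device-wide sum, under the assumption that `cub::DeviceReduce::Sum` and RMM are available (`sum_on_device` is a hypothetical helper):

```cpp
#include <cub/device/device_reduce.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>

// First call (null temp storage) only computes the scratch size;
// the second call performs the reduction using that scratch buffer.
int sum_on_device(rmm::device_uvector<int> const& d_in, rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int> d_out(1, stream);

  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(nullptr,
                         temp_storage_bytes,
                         d_in.begin(),
                         d_out.begin(),
                         static_cast<int>(d_in.size()),
                         stream.value());

  rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
  cub::DeviceReduce::Sum(d_temp_storage.data(),
                         temp_storage_bytes,
                         d_in.begin(),
                         d_out.begin(),
                         static_cast<int>(d_in.size()),
                         stream.value());

  return d_out.front_element(stream);  // synchronizing device-to-host read
}
```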
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 23d0ff26e0f..a00bd64caa3 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -127,12 +127,12 @@ cudf::size_type elements_per_thread(Kernel kernel,
 
   // calculate theoretical occupancy
   int max_blocks = 0;
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
+  CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
 
   int device = 0;
-  CUDF_CUDA_TRY(cudaGetDevice(&device));
+  CUDA_TRY(cudaGetDevice(&device));
   int num_sms = 0;
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+  CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
   int per_thread = total_size / (max_blocks * num_sms * block_size);
   return std::clamp(per_thread, 1, max_per_thread);
 }
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 63ac48f6060..e3f44ce0bee 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -53,7 +53,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(size, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
   return ret;
 }
@@ -75,7 +75,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_sync(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(size, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
   stream.synchronize();
   return ret;
 }
@@ -99,11 +99,11 @@ rmm::device_uvector<T> make_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                                source_data.data(),
-                                source_data.size() * sizeof(T),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(ret.data(),
+                           source_data.data(),
+                           source_data.size() * sizeof(T),
+                           cudaMemcpyDefault,
+                           stream.value()));
   return ret;
 }
@@ -151,11 +151,11 @@ rmm::device_uvector<T> make_device_uvector_async(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                                source_data.data(),
-                                source_data.size() * sizeof(T),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(ret.data(),
+                           source_data.data(),
+                           source_data.size() * sizeof(T),
+                           cudaMemcpyDefault,
+                           stream.value()));
   return ret;
 }
@@ -286,7 +286,7 @@ template <typename T, typename OutContainer>
 OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
   OutContainer result(v.size());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDeviceToHost, stream.value()));
   return result;
 }
diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
index 56db3fd6216..ffe159b59dc 100644
--- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
@@ -78,14 +78,14 @@ class istream_data_chunk_reader : public data_chunk_reader {
   {
     // create an event to track the completion of the last device-to-host copy.
     for (std::size_t i = 0; i < _tickets.size(); i++) {
-      CUDF_CUDA_TRY(cudaEventCreate(&(_tickets[i].event)));
+      CUDA_TRY(cudaEventCreate(&(_tickets[i].event)));
     }
   }
 
   ~istream_data_chunk_reader()
   {
     for (std::size_t i = 0; i < _tickets.size(); i++) {
-      CUDF_CUDA_TRY(cudaEventDestroy(_tickets[i].event));
+      CUDA_TRY(cudaEventDestroy(_tickets[i].event));
     }
   }
 
@@ -101,7 +101,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
     _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size();
 
     // synchronize on the last host-to-device copy, so we don't clobber the host buffer.
-    CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event));
+    CUDA_TRY(cudaEventSynchronize(h_ticket.event));
 
     // resize the host buffer as necessary to contain the requested number of bytes
     if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); }
@@ -116,7 +116,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
     auto chunk = rmm::device_uvector<char>(read_size, stream);
 
     // copy the host-pinned data on to device
-    CUDF_CUDA_TRY(cudaMemcpyAsync(  //
+    CUDA_TRY(cudaMemcpyAsync(  //
       chunk.data(),
      h_ticket.buffer.data(),
      read_size,
@@ -124,7 +124,7 @@ class istream_data_chunk_reader : public data_chunk_reader {
       stream.value()));
 
     // record the host-to-device copy.
-    CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
+    CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value()));
 
     // return the view over device memory so it can be processed.
     return std::make_unique<device_uvector_data_chunk>(std::move(chunk));
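The `istream_data_chunk_reader` above rotates through event-guarded staging buffers so reads and copies overlap. A minimal standalone sketch of that double-buffering pattern (all names here are hypothetical, and cuDF uses a pinned host allocator where this sketch uses `std::vector`):

```cpp
#include <cuda_runtime.h>
#include <algorithm>
#include <array>
#include <vector>

struct staging_ticket {
  cudaEvent_t event;
  std::vector<char> buffer;  // stand-in for a pinned host buffer
};

void stage_chunks(char* d_dst, char const* src, size_t chunk, size_t n_chunks, cudaStream_t stream)
{
  std::array<staging_ticket, 2> tickets;
  for (auto& t : tickets) {
    cudaEventCreate(&t.event);
    t.buffer.resize(chunk);
  }

  for (size_t i = 0; i < n_chunks; ++i) {
    auto& t = tickets[i % tickets.size()];
    cudaEventSynchronize(t.event);  // don't clobber a buffer still being copied
    std::copy(src + i * chunk, src + (i + 1) * chunk, t.buffer.data());
    cudaMemcpyAsync(d_dst + i * chunk, t.buffer.data(), chunk, cudaMemcpyHostToDevice, stream);
    cudaEventRecord(t.event, stream);  // marks when this buffer is reusable
  }
  for (auto& t : tickets) {
    cudaEventSynchronize(t.event);
    cudaEventDestroy(t.event);
  }
}
```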
diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh
index b9ea2d9ecff..4b036fb7f0e 100644
--- a/cpp/include/cudf/strings/detail/utilities.cuh
+++ b/cpp/include/cudf/strings/detail/utilities.cuh
@@ -67,7 +67,7 @@ std::unique_ptr<column> make_offsets_child_column(
   // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first
   // offset values to zero manually.
   thrust::inclusive_scan(rmm::exec_policy(stream), begin, end, d_offsets + 1);
-  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value()));
   return offsets_column;
 }
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index a486a5a765c..27ee5cf95cd 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -99,8 +99,7 @@ CUDF_HOST_DEVICE inline string_view string_view::max()
 #if defined(__CUDA_ARCH__)
   psentinel = &cudf::strings::detail::max_string_sentinel[0];
 #else
-  CUDF_CUDA_TRY(
-    cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
+  CUDA_TRY(cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
 #endif
   return string_view(psentinel, 4);
 }
diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh
index 8d08a3fd0b0..3ed18099463 100644
--- a/cpp/include/cudf/table/table_device_view.cuh
+++ b/cpp/include/cudf/table/table_device_view.cuh
@@ -145,7 +145,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   auto d_columns = detail::child_columns_to_device_array<ColumnDeviceView>(
     source_view.begin(), source_view.end(), h_ptr, d_ptr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
   stream.synchronize();
   return std::make_tuple(std::move(descendant_storage), d_columns);
 }
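The shifted-scan trick in `make_offsets_child_column` above is worth spelling out. A minimal sketch under the assumption that per-row sizes already live on the device (`sizes_to_offsets` is a hypothetical helper, not the cuDF API):

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/scan.h>

// Turn per-row sizes [s0, s1, ...] into offsets [0, s0, s0+s1, ...]:
// inclusive-scan into offsets[1..n], then zero offsets[0] with a 4-byte memset.
rmm::device_uvector<int32_t> sizes_to_offsets(rmm::device_uvector<int32_t> const& sizes,
                                              rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int32_t> offsets(sizes.size() + 1, stream);
  thrust::inclusive_scan(rmm::exec_policy(stream), sizes.begin(), sizes.end(), offsets.begin() + 1);
  cudaMemsetAsync(offsets.data(), 0, sizeof(int32_t), stream.value());
  return offsets;
}
```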
diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp
index 8be1a7e3a32..2036723a6ed 100644
--- a/cpp/include/cudf/utilities/error.hpp
+++ b/cpp/include/cudf/utilities/error.hpp
@@ -1,19 +1,3 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #pragma once
 
 #include <cuda_runtime_api.h>
@@ -115,7 +99,7 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l
  * cudaSuccess, invokes cudaGetLastError() to clear the error and throws an
  * exception detailing the CUDA error that occurred
  */
-#define CUDF_CUDA_TRY(call)                                               \
+#define CUDA_TRY(call)                                                    \
   do {                                                                    \
     cudaError_t const status = (call);                                    \
     if (cudaSuccess != status) {                                          \
@@ -138,12 +122,12 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l
  * asynchronous kernel launch.
  */
 #ifndef NDEBUG
-#define CUDF_CHECK_CUDA(stream)                   \
-  do {                                            \
-    CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); \
-    CUDF_CUDA_TRY(cudaPeekAtLastError());         \
+#define CHECK_CUDA(stream)                   \
+  do {                                       \
+    CUDA_TRY(cudaStreamSynchronize(stream)); \
+    CUDA_TRY(cudaPeekAtLastError());         \
   } while (0);
 #else
-#define CUDF_CHECK_CUDA(stream) CUDF_CUDA_TRY(cudaPeekAtLastError());
+#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError());
 #endif
 /** @} */
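Note the `NDEBUG` split in the macro above: debug builds synchronize the stream so asynchronous kernel failures surface at the launch site, while release builds only peek at the last error and never block. A minimal sketch of what that means at a call site (`my_kernel` and `launch_checked` are hypothetical):

```cpp
#include <cudf/utilities/error.hpp>

__global__ void my_kernel(int* out) { out[threadIdx.x] = threadIdx.x; }

void launch_checked(int* d_out, cudaStream_t stream)
{
  my_kernel<<<1, 256, 0, stream>>>(d_out);
  // Debug build: cudaStreamSynchronize + throw on any error from the kernel.
  // Release build: non-blocking cudaPeekAtLastError only.
  CHECK_CUDA(stream);
}
```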
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index b28ed4f70fa..4c2d4d429eb 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -183,8 +183,7 @@ template <typename T, std::enable_if_t<not cudf::is_fixed_point<T>()>* = nullptr>
 std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view c)
 {
   thrust::host_vector<T> host_data(c.size());
-  CUDF_CUDA_TRY(
-    cudaMemcpy(host_data.data(), c.data<T>(), c.size() * sizeof(T), cudaMemcpyDeviceToHost));
+  CUDA_TRY(cudaMemcpy(host_data.data(), c.data<T>(), c.size() * sizeof(T), cudaMemcpyDeviceToHost));
   return {host_data, bitmask_to_host(c)};
 }
 
@@ -207,7 +206,7 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
   auto host_rep_types = thrust::host_vector<Rep>(c.size());
 
-  CUDF_CUDA_TRY(cudaMemcpy(
+  CUDA_TRY(cudaMemcpy(
     host_rep_types.data(), c.begin<Rep>(), c.size() * sizeof(Rep), cudaMemcpyDeviceToHost));
 
   auto to_fp = [&](Rep val) { return T{scaled_integer<Rep>{val, scale_type{c.type().scale()}}}; };
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index ec41fbb8883..9b3e33f491e 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -265,7 +265,7 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
 {
   int block_size;
   int min_grid_size;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
   // 2 elements per thread.
   const int grid_size = util::div_rounding_up_safe(size, 2 * block_size);
diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp
index 4a94ec9759c..47870e01567 100644
--- a/cpp/src/bitmask/is_element_valid.cpp
+++ b/cpp/src/bitmask/is_element_valid.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,11 +34,11 @@ bool is_element_valid_sync(column_view const& col_view,
   bitmask_type word;
   // null_mask() returns device ptr to bitmask without offset
   size_type index = element_index + col_view.offset();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
-                                col_view.null_mask() + word_index(index),
-                                sizeof(bitmask_type),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(&word,
+                           col_view.null_mask() + word_index(index),
+                           sizeof(bitmask_type),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   stream.synchronize();
   return static_cast<bool>(word & (bitmask_type{1} << intra_word_index(index)));
 }
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 756cf3421c9..d1107ad3cfd 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -90,7 +90,7 @@ rmm::device_buffer create_null_mask(size_type size,
   if (state != mask_state::UNINITIALIZED) {
     uint8_t fill_value = (state == mask_state::ALL_VALID) ? 0xff : 0x00;
-    CUDF_CUDA_TRY(cudaMemsetAsync(
+    CUDA_TRY(cudaMemsetAsync(
       static_cast<bitmask_type*>(mask.data()), fill_value, mask_size, stream.value()));
   }
 
@@ -146,7 +146,7 @@ void set_null_mask(bitmask_type* bitmask,
     cudf::detail::grid_1d config(number_of_mask_words, 256);
     set_null_mask_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       static_cast<bitmask_type*>(bitmask), begin_bit, end_bit, valid, number_of_mask_words);
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
   }
 }
@@ -220,7 +220,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
     cudf::detail::grid_1d config(number_of_mask_words, 256);
     copy_offset_bitmask<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
       static_cast<bitmask_type*>(dest_mask.data()), mask, begin_bit, end_bit, number_of_mask_words);
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
   }
   return dest_mask;
 }
diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu
index fc244521617..dd1803f4b90 100644
--- a/cpp/src/column/column_device_view.cu
+++ b/cpp/src/column/column_device_view.cu
@@ -77,11 +77,11 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
     new ColumnDeviceView(source, staging_buffer.data(), descendant_storage->data()), deleter};
 
   // copy the CPU memory with all the children into device memory
-  CUDF_CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
-                                staging_buffer.data(),
-                                descendant_storage->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
+                           staging_buffer.data(),
+                           descendant_storage->size(),
+                           cudaMemcpyDefault,
+                           stream.value()));
 
   stream.synchronize();
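The `set_null_mask_kernel` and `copy_offset_bitmask` launches in null_mask.cu above all follow the same `grid_1d` + `CHECK_CUDA` shape. A minimal sketch of that launch pattern (`my_word_kernel` and `launch_over_words` are hypothetical names):

```cpp
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/utilities/error.hpp>

__global__ void my_word_kernel(cudf::bitmask_type* words, cudf::size_type n)
{
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) words[i] = ~words[i];  // toy per-word transform
}

void launch_over_words(cudf::bitmask_type* words, cudf::size_type n, rmm::cuda_stream_view stream)
{
  cudf::detail::grid_1d config(n, 256);  // n total threads, 256 per block
  my_word_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(words, n);
  CHECK_CUDA(stream.value());  // surfaces kernel errors in debug builds
}
```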
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 514374d450d..46470e69611 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1046,11 +1046,11 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   setup_source_buf_info(input.begin(), input.end(), h_src_buf_info, h_src_buf_info);
 
   // HtoD indices and source buf info to device
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_indices,
-                                h_indices,
-                                indices_size + src_buf_info_size,
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_indices,
+                           h_indices,
+                           indices_size + src_buf_info_size,
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   // packed block of memory 2. partition buffer sizes and dst_buf_info structs
   std::size_t const buf_sizes_size =
@@ -1180,11 +1180,11 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   }
 
   // DtoH buf sizes and col info back to the host
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
-                                d_buf_sizes,
-                                buf_sizes_size + dst_buf_info_size,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
+                           d_buf_sizes,
+                           buf_sizes_size + dst_buf_info_size,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   stream.synchronize();
 
   // allocate output partition buffers
@@ -1224,14 +1224,14 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   });
 
   // HtoD src and dest buffers
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     d_src_bufs, h_src_bufs, src_bufs_size + dst_bufs_size, cudaMemcpyHostToDevice, stream.value()));
 
   // perform the copy.
   copy_data(num_bufs, num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info, stream);
 
   // DtoH dst info (to retrieve null counts)
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     h_dst_buf_info, d_dst_buf_info, dst_buf_info_size, cudaMemcpyDeviceToHost, stream.value()));
 
   stream.synchronize();
diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
index 76f3fba4689..0ae0baa9908 100644
--- a/cpp/src/hash/concurrent_unordered_map.cuh
+++ b/cpp/src/hash/concurrent_unordered_map.cuh
@@ -432,11 +432,11 @@ class concurrent_unordered_map {
       m_hashtbl_values = m_allocator.allocate(m_capacity, stream);
     }
-    CUDF_CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
-                                  other.m_hashtbl_values,
-                                  m_capacity * sizeof(value_type),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
+                             other.m_hashtbl_values,
+                             m_capacity * sizeof(value_type),
+                             cudaMemcpyDefault,
+                             stream.value()));
   }
 
   void clear_async(rmm::cuda_stream_view stream = rmm::cuda_stream_default)
@@ -460,10 +460,10 @@ class concurrent_unordered_map {
     cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
 
     if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
+      CUDA_TRY(cudaMemPrefetchAsync(
         m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
     }
-    CUDF_CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
+    CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
   }
 
@@ -532,8 +532,8 @@ class concurrent_unordered_map {
     if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
       int dev_id = 0;
-      CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
-      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
+      CUDA_TRY(cudaGetDevice(&dev_id));
+      CUDA_TRY(cudaMemPrefetchAsync(
         m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
     }
   }
@@ -543,6 +543,6 @@ class concurrent_unordered_map {
       m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
   }
 
-    CUDF_CHECK_CUDA(stream.value());
+    CUDA_TRY(cudaGetLastError());
   }
 };
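The prefetch calls in `concurrent_unordered_map` only make sense for managed allocations, which is why they are guarded by a pointer-attribute check. A minimal sketch of that guard using the raw runtime API (cuDF's `isPtrManaged` helper is replaced here by a direct check of `cudaPointerAttributes::type`; `prefetch_if_managed` is a hypothetical name):

```cpp
#include <cuda_runtime.h>

// Ask the driver to migrate a managed allocation to the current device ahead
// of the kernels that will touch it; do nothing for ordinary device memory.
void prefetch_if_managed(void const* p, size_t bytes, cudaStream_t stream)
{
  cudaPointerAttributes attrs{};
  if (cudaPointerGetAttributes(&attrs, p) == cudaSuccess &&
      attrs.type == cudaMemoryTypeManaged) {
    int dev = 0;
    cudaGetDevice(&dev);
    cudaMemPrefetchAsync(p, bytes, dev, stream);
  }
}
```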
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index e5da4794ca3..01ca32e6a2f 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -144,7 +144,7 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
   // Make sure the current device ID matches the Tensor's device ID
   if (tensor.device.device_type != kDLCPU) {
     int device_id = 0;
-    CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+    CUDA_TRY(cudaGetDevice(&device_id));
     CUDF_EXPECTS(tensor.device.device_id == device_id, "DLTensor device ID must be current device");
   }
@@ -184,11 +184,11 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
   for (auto& col : columns) {
     col = make_numeric_column(dtype, num_rows, mask_state::UNALLOCATED, stream, mr);
 
-    CUDF_CUDA_TRY(cudaMemcpyAsync(col->mutable_view().head(),
-                                  reinterpret_cast<void const*>(tensor_data),
-                                  bytes,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(col->mutable_view().head(),
+                             reinterpret_cast<void const*>(tensor_data),
+                             bytes,
+                             cudaMemcpyDefault,
+                             stream.value()));
 
     tensor_data += col_stride;
   }
@@ -234,7 +234,7 @@ DLManagedTensor* to_dlpack(table_view const& input,
     tensor.strides[1] = num_rows;
   }
 
-  CUDF_CUDA_TRY(cudaGetDevice(&tensor.device.device_id));
+  CUDA_TRY(cudaGetDevice(&tensor.device.device_id));
   tensor.device.device_type = kDLCUDA;
 
   // If there is only one column, then a 1D tensor can just copy the pointer
@@ -254,11 +254,11 @@ DLManagedTensor* to_dlpack(table_view const& input,
   auto tensor_data = reinterpret_cast<uintptr_t>(tensor.data);
   for (auto const& col : input) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(reinterpret_cast<void*>(tensor_data),
-                                  get_column_data(col),
-                                  stride_bytes,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(reinterpret_cast<void*>(tensor_data),
+                             get_column_data(col),
+                             stride_bytes,
+                             cudaMemcpyDefault,
+                             stream.value()));
     tensor_data += stride_bytes;
   }
diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
index 6c5cd56d2a7..99b657fb9d5 100644
--- a/cpp/src/interop/from_arrow.cu
+++ b/cpp/src/interop/from_arrow.cu
@@ -108,11 +108,11 @@ struct dispatch_to_cudf_column {
       stream,
       mr);
     auto mask_buffer = array.null_bitmap();
-    CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(),
-                                  reinterpret_cast<const uint8_t*>(mask_buffer->address()),
-                                  array.null_bitmap()->size(),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(mask->data(),
+                             reinterpret_cast<const uint8_t*>(mask_buffer->address()),
+                             array.null_bitmap()->size(),
+                             cudaMemcpyDefault,
+                             stream.value()));
     return mask;
   }
@@ -135,7 +135,7 @@ struct dispatch_to_cudf_column {
     auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr;
     auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
     auto mutable_column_view = col->mutable_view();
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       mutable_column_view.data<T>(),
       reinterpret_cast<const uint8_t*>(data_buffer->address()) + array.offset() * sizeof(T),
       sizeof(T) * num_rows,
@@ -191,7 +191,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()(
   auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
   auto mutable_column_view = col->mutable_view();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     mutable_column_view.data<DeviceType>(),
     reinterpret_cast<const uint8_t*>(data_buffer->address()) + array.offset() * sizeof(DeviceType),
     sizeof(DeviceType) * num_rows,
@@ -227,11 +227,11 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()(
 {
   auto data_buffer = array.data()->buffers[1];
   auto data = rmm::device_buffer(data_buffer->size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(),
-                                reinterpret_cast<const uint8_t*>(data_buffer->address()),
-                                data_buffer->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data.data(),
+                           reinterpret_cast<const uint8_t*>(data_buffer->address()),
+                           data_buffer->size(),
+                           cudaMemcpyDefault,
+                           stream.value()));
   auto out_col = mask_to_bools(static_cast<bitmask_type const*>(data.data()),
                                array.offset(),
                                array.offset() + array.length(),
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 517a83c716e..c7409978bb2 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -53,11 +53,11 @@ std::shared_ptr<arrow::Buffer> fetch_data_buffer(column_view input_view,
   auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                input_view.data<T>(),
-                                data_size_in_bytes,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           input_view.data<T>(),
+                           data_size_in_bytes,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   return std::move(data_buffer);
 }
@@ -73,7 +73,7 @@ std::shared_ptr<arrow::Buffer> fetch_mask_buffer(column_view input_view,
   if (input_view.has_nulls()) {
     auto mask_buffer = allocate_arrow_bitmap(static_cast<int64_t>(input_view.size()), ar_mr);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       mask_buffer->mutable_data(),
       (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(),
      mask_size_in_bytes,
@@ -163,11 +163,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()(
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
   auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                buf.data(),
-                                buf_size_in_bytes,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           buf.data(),
+                           buf_size_in_bytes,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   auto type = arrow::decimal(18, -input.type().scale());
   auto mask = fetch_mask_buffer(input, ar_mr, stream);
@@ -197,11 +197,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()(
   auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
   auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                buf.data(),
-                                buf_size_in_bytes,
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           buf.data(),
+                           buf_size_in_bytes,
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   auto type = arrow::decimal(18, -input.type().scale());
   auto mask = fetch_mask_buffer(input, ar_mr, stream);
@@ -222,11 +222,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()(column_view input,
   auto data_buffer = allocate_arrow_buffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                bitmask.first->data(),
-                                bitmask.first->size(),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                           bitmask.first->data(),
+                           bitmask.first->size(),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   return to_arrow_array(id,
                         static_cast<int64_t>(input.size()),
                         std::move(data_buffer),
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index 5885b61b35b..b5b76c2def8 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -192,9 +192,9 @@ rmm::device_buffer decompress_data(datasource& source,
     for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) {
       inflate_in.host_to_device(stream);
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value()));
-      CUDF_CUDA_TRY(gpuinflate(
+      CUDA_TRY(gpuinflate(
         inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream));
       inflate_out.device_to_host(stream, true);
@@ -424,11 +424,11 @@ std::vector<column_buffer> decode_data(metadata& meta,
   // Copy valid bits that are shared between columns
   for (size_t i = 0; i < out_buffers.size(); i++) {
     if (valid_alias[i] != nullptr) {
-      CUDF_CUDA_TRY(cudaMemcpyAsync(out_buffers[i].null_mask(),
-                                    valid_alias[i],
-                                    out_buffers[i].null_mask_size(),
-                                    cudaMemcpyHostToDevice,
-                                    stream.value()));
+      CUDA_TRY(cudaMemcpyAsync(out_buffers[i].null_mask(),
+                               valid_alias[i],
+                               out_buffers[i].null_mask_size(),
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
     }
   }
   schema_desc.device_to_host(stream, true);
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 631cf19b2aa..b4a42a66133 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -2048,7 +2048,7 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
   int sm_count = 0;
   int dev = 0;
   uint32_t max_fb_size, min_fb_size, fb_size;
-  CUDF_CUDA_TRY(cudaGetDevice(&dev));
+  CUDA_TRY(cudaGetDevice(&dev));
   if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) {
     // printf("%d SMs on device %d\n", sm_count, dev);
     max_num_inputs =
@@ -2092,14 +2092,14 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
   scratch_size = min(scratch_size, (size_t)0xffffffffu);
   fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf);
 
-  CUDF_CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value()));
   // NOTE: The 128KB dictionary copy can have a relatively large overhead since source isn't
   // page-locked
-  CUDF_CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size,
-                                get_brotli_dictionary(),
-                                sizeof(brotli_dictionary_s),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size,
+                           get_brotli_dictionary(),
+                           sizeof(brotli_dictionary_s),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
   gpu_debrotli_kernel<<>>(
     inputs, outputs, scratch_u8, fb_heap_size, count32);
 #if DUMP_FB_HEAP
@@ -2107,7 +2107,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
   uint32_t cur = 0;
   printf("heap dump (%d bytes)\n", fb_heap_size);
   while (cur < fb_heap_size && !(cur & 3)) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       &dump[0], scratch_u8 + cur, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream.value()));
     stream.synchronize();
     printf("@%d: next = %d, size = %d\n", cur, dump[0], dump[1]);
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index cd070d28f38..ae9738164f3 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -260,11 +260,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
   auto const previous_data_size = d_data.size();
   d_data.resize(target_pos - buffer_pos, stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
-                                data.begin() + buffer_pos + previous_data_size,
-                                target_pos - buffer_pos - previous_data_size,
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
+                           data.begin() + buffer_pos + previous_data_size,
+                           target_pos - buffer_pos - previous_data_size,
+                           cudaMemcpyDefault,
+                           stream.value()));
 
   // Pass 1: Count the potential number of rows in each character block for each
   // possible parser state at the beginning of the block.
@@ -280,11 +280,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
     range_end,
     skip_rows,
     stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                row_ctx.device_ptr(),
-                                num_blocks * sizeof(uint64_t),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                           row_ctx.device_ptr(),
+                           num_blocks * sizeof(uint64_t),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
   stream.synchronize();
 
   // Sum up the rows in each character block, selecting the row count that
@@ -300,11 +300,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
       // At least one row in range in this batch
       all_row_offsets.resize(total_rows - skip_rows, stream);
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
-                                    row_ctx.host_ptr(),
-                                    num_blocks * sizeof(uint64_t),
-                                    cudaMemcpyHostToDevice,
-                                    stream.value()));
+      CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
+                               row_ctx.host_ptr(),
+                               num_blocks * sizeof(uint64_t),
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
 
       // Pass 2: Output row offsets
       cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
@@ -321,11 +321,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
         stream);
       // With byte range, we want to keep only one row out of the specified range
       if (range_end < data.size()) {
-        CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                      row_ctx.device_ptr(),
-                                      num_blocks * sizeof(uint64_t),
-                                      cudaMemcpyDeviceToHost,
-                                      stream.value()));
+        CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                                 row_ctx.device_ptr(),
+                                 num_blocks * sizeof(uint64_t),
+                                 cudaMemcpyDeviceToHost,
+                                 stream.value()));
         stream.synchronize();
 
         size_t rows_out_of_range = 0;
@@ -370,11 +370,11 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather
   // Remove header rows and extract header
   const size_t header_row_index = std::max(header_rows, 1) - 1;
   if (header_row_index + 1 < row_offsets.size()) {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                  row_offsets.data() + header_row_index,
-                                  2 * sizeof(uint64_t),
-                                  cudaMemcpyDeviceToHost,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                             row_offsets.data() + header_row_index,
+                             2 * sizeof(uint64_t),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
     stream.synchronize();
 
     const auto header_start = buffer_pos + row_ctx[0];
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 2aa93ae4d0f..cb2197cf755 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -378,11 +378,11 @@ void write_chunked(data_sink* out_sink,
   } else {
     // copy the bytes to host to write them out
     thrust::host_vector<char> h_bytes(total_num_bytes);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(),
-                                  ptr_all_bytes,
-                                  total_num_bytes * sizeof(char),
-                                  cudaMemcpyDeviceToHost,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(h_bytes.data(),
+                             ptr_all_bytes,
+                             total_num_bytes * sizeof(char),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
     stream.synchronize();
 
     out_sink->host_write(h_bytes.data(), total_num_bytes);
diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index 56a00191ae4..d26831b9112 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -689,7 +689,7 @@ void convert_json_to_columns(parse_options_view const& opts,
 {
   int block_size;
   int min_grid_size;
-  CUDF_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
+  CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
     &min_grid_size, &block_size, convert_data_to_columns_kernel));
 
   const int grid_size = (row_offsets.size() + block_size - 1) / block_size;
convert_json_to_columns(parse_options_view const& opts, valid_fields, num_valid_fields); - CUDF_CHECK_CUDA(stream.value()); + CUDA_TRY(cudaGetLastError()); } /** @@ -721,7 +721,7 @@ std::vector detect_data_types( { int block_size; int min_grid_size; - CUDF_CUDA_TRY( + CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, detect_data_types_kernel)); auto d_column_infos = [&]() { @@ -763,7 +763,7 @@ void collect_keys_info(parse_options_view const& options, { int block_size; int min_grid_size; - CUDF_CUDA_TRY( + CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, collect_keys_info_kernel)); // Calculate actual block count to use based on records count @@ -772,7 +772,7 @@ void collect_keys_info(parse_options_view const& options, collect_keys_info_kernel<<>>( options, data, row_offsets, keys_cnt, keys_info); - CUDF_CHECK_CUDA(stream.value()); + CUDA_TRY(cudaGetLastError()); } } // namespace gpu diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 20eeec267b1..5ca947f3ee5 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -274,7 +274,7 @@ rmm::device_uvector find_record_starts(json_reader_options const& read // Manually adding an extra row to account for the first row in the file if (reader_opts.get_byte_range_offset() == 0) { find_result_ptr++; - CUDF_CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); + CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); } std::vector chars_to_find{'\n'}; @@ -356,18 +356,18 @@ std::pair, col_map_ptr_type> get_column_names_and_map( uint64_t first_row_len = d_data.size(); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists - CUDF_CUDA_TRY(cudaMemcpyAsync(&first_row_len, - rec_starts.data() + 1, - sizeof(uint64_t), - cudaMemcpyDeviceToHost, - stream.value())); + CUDA_TRY(cudaMemcpyAsync(&first_row_len, + rec_starts.data() + 1, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + stream.value())); } std::vector first_row(first_row_len); - CUDF_CUDA_TRY(cudaMemcpyAsync(first_row.data(), - d_data.data(), - first_row_len * sizeof(char), - cudaMemcpyDeviceToHost, - stream.value())); + CUDA_TRY(cudaMemcpyAsync(first_row.data(), + d_data.data(), + first_row_len * sizeof(char), + cudaMemcpyDeviceToHost, + stream.value())); stream.synchronize(); // Determine the row format between: diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 83c23774362..059df283c94 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -431,7 +431,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( device_span inflate_out_view(inflate_out.data(), num_compressed_blocks); switch (decompressor->GetKind()) { case orc::ZLIB: - CUDF_CUDA_TRY( + CUDA_TRY( gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); break; case orc::SNAPPY: @@ -440,7 +440,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( num_compressed_blocks}; snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream); } else { - CUDF_CUDA_TRY( + CUDA_TRY( gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream)); } break; @@ -449,7 +449,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); } if (num_uncompressed_blocks > 0) { - CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks( + 
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 83c23774362..059df283c94 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -431,7 +431,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
   device_span<gpu_inflate_status_s> inflate_out_view(inflate_out.data(), num_compressed_blocks);
   switch (decompressor->GetKind()) {
     case orc::ZLIB:
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream));
       break;
     case orc::SNAPPY:
      if (nvcomp_integration::is_stable_enabled()) {
        device_span<gpu_inflate_input_s> inflate_in_view{inflate_in.data(),
                                                         num_compressed_blocks};
        snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream);
      } else {
-        CUDF_CUDA_TRY(
+        CUDA_TRY(
          gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream));
      }
      break;
@@ -449,7 +449,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
     decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream);
   }
   if (num_uncompressed_blocks > 0) {
-    CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks(
+    CUDA_TRY(gpu_copy_uncompressed_blocks(
       inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream));
   }
   gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream);
@@ -1129,7 +1129,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
             _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
               offset, len);
           CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
-          CUDF_CUDA_TRY(cudaMemcpyAsync(
+          CUDA_TRY(cudaMemcpyAsync(
             d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
           stream.synchronize();
         }
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index d0c1cea97a8..30385d395f1 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -706,11 +706,11 @@ std::vector<std::vector<rowgroup_rows>> calculate_aligned_rowgroup_bounds(
   auto aligned_rgs = hostdevice_2dvector<rowgroup_rows>(
     segmentation.num_rowgroups(), orc_table.num_columns(), stream);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(),
-                                segmentation.rowgroups.base_device_ptr(),
-                                aligned_rgs.count() * sizeof(rowgroup_rows),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(),
+                           segmentation.rowgroups.base_device_ptr(),
+                           aligned_rgs.count() * sizeof(rowgroup_rows),
+                           cudaMemcpyDefault,
+                           stream.value()));
   auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream);
 
   // One thread per column, per stripe
@@ -1330,7 +1330,7 @@ std::future<void> writer::impl::write_data_stream(gpu::StripeStream const& strm_
   if (out_sink_->is_device_write_preferred(length)) {
     return out_sink_->device_write_async(stream_in, length, stream);
   } else {
-    CUDF_CUDA_TRY(
+    CUDA_TRY(
       cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value()));
     stream.synchronize();
@@ -1419,10 +1419,10 @@ void pushdown_lists_null_mask(orc_column_view const& col,
                               rmm::cuda_stream_view stream)
 {
   // Set all bits - correct unless there's a mismatch between offsets and null mask
-  CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(out_mask.data()),
-                                255,
-                                out_mask.size() * sizeof(bitmask_type),
-                                stream.value()));
+  CUDA_TRY(cudaMemsetAsync(static_cast<void*>(out_mask.data()),
+                           255,
+                           out_mask.size() * sizeof(bitmask_type),
+                           stream.value()));
 
   // Reset bits where a null list element has rows in the child column
   thrust::for_each_n(
@@ -1946,7 +1946,7 @@ void writer::impl::write(table_view const& table)
   } else {
     return pinned_buffer{[](size_t size) {
                            uint8_t* ptr = nullptr;
-                           CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
+                           CUDA_TRY(cudaMallocHost(&ptr, size));
                            return ptr;
                          }(max_stream_size),
                          cudaFreeHost};
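The `pinned_buffer` lambda above pairs `cudaMallocHost` with `cudaFreeHost` through a smart pointer. A minimal standalone sketch of the same idea (`make_pinned` is a hypothetical helper):

```cpp
#include <cuda_runtime.h>
#include <memory>

// Page-locked host memory makes device-to-host copies truly asynchronous;
// a unique_ptr with cudaFreeHost as its deleter ties the allocation to scope.
using pinned_buffer = std::unique_ptr<uint8_t, decltype(&cudaFreeHost)>;

pinned_buffer make_pinned(size_t size)
{
  uint8_t* ptr = nullptr;
  cudaMallocHost(&ptr, size);
  return pinned_buffer{ptr, cudaFreeHost};
}
```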
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 46b3206f731..df940bb15d3 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1239,24 +1239,24 @@ rmm::device_buffer reader::impl::decompress_page_data(
       argc++;
     });
 
-    CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos),
-                                  inflate_in.host_ptr(start_pos),
-                                  sizeof(decltype(inflate_in)::value_type) * (argc - start_pos),
-                                  cudaMemcpyHostToDevice,
-                                  stream.value()));
-    CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos),
-                                  inflate_out.host_ptr(start_pos),
-                                  sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
-                                  cudaMemcpyHostToDevice,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos),
+                             inflate_in.host_ptr(start_pos),
+                             sizeof(decltype(inflate_in)::value_type) * (argc - start_pos),
+                             cudaMemcpyHostToDevice,
+                             stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos),
+                             inflate_out.host_ptr(start_pos),
+                             sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
+                             cudaMemcpyHostToDevice,
+                             stream.value()));
 
     switch (codec.compression_type) {
       case parquet::GZIP:
-        CUDF_CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos),
-                                 inflate_out.device_ptr(start_pos),
-                                 argc - start_pos,
-                                 1,
-                                 stream))
+        CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos),
+                            inflate_out.device_ptr(start_pos),
+                            argc - start_pos,
+                            1,
+                            stream))
         break;
       case parquet::SNAPPY:
         if (nvcomp_integration::is_stable_enabled()) {
@@ -1265,27 +1265,27 @@ rmm::device_buffer reader::impl::decompress_page_data(
             codec.max_decompressed_size,
             stream);
         } else {
-          CUDF_CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos),
-                                   inflate_out.device_ptr(start_pos),
-                                   argc - start_pos,
-                                   stream));
+          CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos),
+                              inflate_out.device_ptr(start_pos),
+                              argc - start_pos,
+                              stream));
         }
         break;
       case parquet::BROTLI:
-        CUDF_CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos),
-                                   inflate_out.device_ptr(start_pos),
-                                   debrotli_scratch.data(),
-                                   debrotli_scratch.size(),
-                                   argc - start_pos,
-                                   stream));
+        CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos),
+                              inflate_out.device_ptr(start_pos),
+                              debrotli_scratch.data(),
+                              debrotli_scratch.size(),
+                              argc - start_pos,
+                              stream));
         break;
       default: CUDF_FAIL("Unexpected decompression dispatch"); break;
     }
-    CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos),
-                                  inflate_out.device_ptr(start_pos),
-                                  sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
-                                  cudaMemcpyDeviceToHost,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos),
+                             inflate_out.device_ptr(start_pos),
+                             sizeof(decltype(inflate_out)::value_type) * (argc - start_pos),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
   }
 }
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index cb1acb4d9ec..a29164ba051 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1076,7 +1076,7 @@ void writer::impl::encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks
       if (nvcomp_integration::is_stable_enabled()) {
         snappy_compress(comp_in, comp_stat, max_page_uncomp_data_size, stream);
       } else {
-        CUDF_CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream));
+        CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream));
       }
       break;
     default: break;
@@ -1089,11 +1089,11 @@ void writer::impl::encode_pages(hostdevice_2dvector<gpu::EncColumnChunk>& chunks
   GatherPages(d_chunks_in_batch.flat_view(), pages, stream);
 
   auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(),
-                                d_chunks_in_batch.data(),
-                                d_chunks_in_batch.flat_view().size_bytes(),
-                                cudaMemcpyDeviceToHost,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(),
+                           d_chunks_in_batch.data(),
+                           d_chunks_in_batch.flat_view().size_bytes(),
+                           cudaMemcpyDeviceToHost,
+                           stream.value()));
 
   stream.synchronize();
 }
@@ -1532,28 +1532,28 @@ void writer::impl::write(table_view const& table, std::vector co
           // we still need to do a (much smaller) memcpy for the statistics.
           if (ck.ck_stat_size != 0) {
             column_chunk_meta.statistics_blob.resize(ck.ck_stat_size);
-            CUDF_CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(),
-                                          dev_bfr,
-                                          ck.ck_stat_size,
-                                          cudaMemcpyDeviceToHost,
-                                          stream.value()));
+            CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(),
+                                     dev_bfr,
+                                     ck.ck_stat_size,
+                                     cudaMemcpyDeviceToHost,
+                                     stream.value()));
             stream.synchronize();
           }
         } else {
           if (!host_bfr) {
             host_bfr = pinned_buffer{[](size_t size) {
                                        uint8_t* ptr = nullptr;
-                                       CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
+                                       CUDA_TRY(cudaMallocHost(&ptr, size));
                                        return ptr;
                                      }(max_chunk_bfr_size),
                                      cudaFreeHost};
           }
           // copy the full data
-          CUDF_CUDA_TRY(cudaMemcpyAsync(host_bfr.get(),
-                                        dev_bfr,
-                                        ck.ck_stat_size + ck.compressed_size,
-                                        cudaMemcpyDeviceToHost,
-                                        stream.value()));
+          CUDA_TRY(cudaMemcpyAsync(host_bfr.get(),
+                                   dev_bfr,
+                                   ck.ck_stat_size + ck.compressed_size,
+                                   cudaMemcpyDeviceToHost,
+                                   stream.value()));
           stream.synchronize();
           out_sink_[p]->host_write(host_bfr.get() + ck.ck_stat_size, ck.compressed_size);
           if (ck.ck_stat_size != 0) {
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index fd510466477..34d8307b024 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -55,7 +55,7 @@ inline rmm::device_buffer create_data(data_type type,
   std::size_t data_size = size_of(type) * size;
 
   rmm::device_buffer data(data_size, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value()));
+  CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value()));
 
   return data;
 }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 5c73cf31428..367bbfcbdfa 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -54,7 +54,7 @@ class hostdevice_vector {
     : num_elements(initial_size), max_elements(max_size)
   {
     if (max_elements != 0) {
-      CUDF_CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements));
+      CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements));
       d_data.resize(sizeof(T) * max_elements, stream);
     }
   }
@@ -101,14 +101,14 @@ class hostdevice_vector {
   void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false)
   {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       d_data.data(), h_data, memory_size(), cudaMemcpyHostToDevice, stream.value()));
     if (synchronize) { stream.synchronize(); }
   }
 
   void device_to_host(rmm::cuda_stream_view stream, bool synchronize = false)
   {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       h_data, d_data.data(), memory_size(), cudaMemcpyDeviceToHost, stream.value()));
     if (synchronize) { stream.synchronize(); }
   }
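`hostdevice_vector` above is essentially a pinned host mirror paired with a device buffer. A stripped-down sketch of that idea, assuming RMM is available (`host_device_pair` is a hypothetical name, not the cuDF class):

```cpp
#include <cuda_runtime.h>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

template <typename T>
struct host_device_pair {
  T* h_data{nullptr};          // pinned host mirror
  rmm::device_buffer d_data;   // device storage
  size_t count{0};

  host_device_pair(size_t n, rmm::cuda_stream_view stream)
    : d_data(n * sizeof(T), stream), count(n)
  {
    cudaMallocHost(&h_data, n * sizeof(T));  // pinned, so the copies below can be async
  }
  ~host_device_pair() { cudaFreeHost(h_data); }

  void to_device(rmm::cuda_stream_view stream)
  {
    cudaMemcpyAsync(d_data.data(), h_data, count * sizeof(T), cudaMemcpyHostToDevice, stream.value());
  }
  void to_host(rmm::cuda_stream_view stream)
  {
    cudaMemcpyAsync(h_data, d_data.data(), count * sizeof(T), cudaMemcpyDeviceToHost, stream.value());
    stream.synchronize();  // host reads h_data immediately after
  }
};
```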
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 #include
 #include
 #include
@@ -122,7 +106,7 @@ cudf::size_type find_all_from_set(device_span data,
 {
   int block_size    = 0;  // suggested thread count to use
   int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions));
   const int grid_size = divCeil(data.size(), (size_t)block_size);
 
@@ -147,7 +131,7 @@ cudf::size_type find_all_from_set(host_span data,
   int block_size    = 0;  // suggested thread count to use
   int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions));
 
   const size_t chunk_count = divCeil(data.size(), max_chunk_bytes);
@@ -159,7 +143,7 @@ cudf::size_type find_all_from_set(host_span data,
     const int grid_size = divCeil(chunk_bits, block_size);
 
     // Copy chunk to device
-    CUDF_CUDA_TRY(
+    CUDA_TRY(
       cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value()));
 
     for (char key : keys) {
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index 8228ff6da1f..159681eaffc 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -77,9 +77,9 @@ std::filesystem::path get_cache_dir()
     int device;
     int cc_major;
     int cc_minor;
-    CUDF_CUDA_TRY(cudaGetDevice(&device));
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));
-    CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device));
+    CUDA_TRY(cudaGetDevice(&device));
+    CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device));
+    CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device));
     int cc = cc_major * 10 + cc_minor;
 
     kernel_cache_path /= std::to_string(cc);
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 01a94457b69..043c04b409e 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -139,7 +139,7 @@ void materialize_bitmask(column_view const& left_col,
     }
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 }
 
 struct side_index_generator {
@@ -212,7 +212,7 @@ index_vector generate_merged_indices(table_view const& left_table,
                       ineq_op);
   }
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
   return merged_indices;
 }
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 280a42d9e20..171b81152ff 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -29,7 +29,7 @@ CUDF_HOST_DEVICE inline Result get_array_value(T const* devarr, size_type locati
 #if defined(__CUDA_ARCH__)
   result = devarr[location];
 #else
-  CUDF_CUDA_TRY(cudaMemcpy(&result, devarr + location, sizeof(T), cudaMemcpyDeviceToHost));
+  CUDA_TRY(cudaMemcpy(&result, devarr + location, sizeof(T), cudaMemcpyDeviceToHost));
 #endif
   return static_cast(result);
 }
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 454a8c9d694..61f728447e8 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -216,7 +216,7 @@ struct minmax_functor {
     // copy the minmax_pair to the host; does not copy the strings
     using OutputType = minmax_pair;
     OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value()));
     // strings are copied to create the scalars here
     return {std::make_unique(host_result.min_val, true, stream, mr),
@@ -235,7 +235,7 @@ struct minmax_functor {
     // copy the minmax_pair to the host to call get_element
     using OutputType = minmax_pair;
     OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value()));
     // get the keys for those indexes
     auto const keys = dictionary_column_view(col).keys();
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 885d7e904b4..3b8cc17c4aa 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -67,7 +67,7 @@ struct scan_dispatcher {
     thrust::exclusive_scan(
       rmm::exec_policy(stream), begin, begin + input.size(), output.data(), identity, Op{});
 
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
     return output_column;
   }
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 5ffdf1f5c56..9d07f340ebf 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -120,7 +120,7 @@ struct scan_functor {
     thrust::inclusive_scan(
       rmm::exec_policy(stream), begin, begin + input_view.size(), result.data(), Op{});
 
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
     return output_column;
   }
 };
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index ca07d60f426..d704b18774f 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1281,7 +1281,7 @@ std::unique_ptr rolling_window_udf(column_view const& input,
   output->set_null_count(output->size() - device_valid_count.value(stream));
 
   // check the stream for debugging
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return output;
 }
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 19bb60ef1a8..76ec171052a 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -114,7 +114,7 @@ std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
   std::string result;
   result.resize(_data.size());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     &result[0], _data.data(), _data.size(), cudaMemcpyDeviceToHost, stream.value()));
   stream.synchronize();
   return result;
diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu
index 29eddf703df..477666d93ae 100644
--- a/cpp/src/search/search.cu
+++ b/cpp/src/search/search.cu
@@ -97,8 +97,7 @@ std::unique_ptr search_ordered(table_view const& t,
   // Handle empty inputs
   if (t.num_rows() == 0) {
-    CUDF_CUDA_TRY(
-      cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value()));
+    CUDA_TRY(cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value()));
     return result;
   }
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 6a90a605ca3..adfd24f1ca2 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -87,11 +87,11 @@ std::unique_ptr join_strings(strings_column_view const& strings,
   auto offsets_view = offsets_column->mutable_view();
   // set the first entry to 0 and the last entry to bytes
   int32_t new_offsets[] = {0, static_cast(bytes)};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data(),
-                                new_offsets,
-                                sizeof(new_offsets),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(offsets_view.data(),
+                           new_offsets,
+                           sizeof(new_offsets),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   // build null mask
   // only one entry so it is either all valid or all null
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 1a423ef8eec..ac3c4df6aeb 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -153,11 +153,11 @@ struct format_compiler {
     // create program in device memory
     d_items.resize(items.size(), stream);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(),
-                                  items.data(),
-                                  items.size() * sizeof(items[0]),
-                                  cudaMemcpyHostToDevice,
-                                  stream.value()));
+    CUDA_TRY(cudaMemcpyAsync(d_items.data(),
+                             items.data(),
+                             items.size() * sizeof(items[0]),
+                             cudaMemcpyHostToDevice,
+                             stream.value()));
   }
 
   format_item const* compiled_format_items() { return d_items.data(); }
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index fedb8d38a08..9fa033e9f9a 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -297,7 +297,7 @@ std::unique_ptr concatenate(host_span columns,
         cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - bytes_offset;
 
-      CUDF_CUDA_TRY(
+      CUDA_TRY(
         cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream.value()));
 
       // get ready for the next column
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index 3bcf55cf069..b286812226b 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -163,7 +163,7 @@ std::unique_ptr> reprog_devic
   }
 
   // copy flat prog to device memory
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice, stream.value()));
 
   //
   auto deleter = [d_buffer, d_relists](reprog_device* t) {
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index d496b46bc36..c0673a5e2b5 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -81,10 +81,10 @@ auto generate_empty_output(strings_column_view const& input,
   auto offsets_column = make_numeric_column(
     data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(),
-                                0,
-                                offsets_column->size() * sizeof(offset_type),
-                                stream.value()));
+  CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(),
+                           0,
+                           offsets_column->size() * sizeof(offset_type),
+                           stream.value()));
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),
@@ -264,7 +264,7 @@ auto make_strings_children(Func fn,
   } else {
     // Compute the offsets values from the provided output string sizes.
     auto const string_sizes = output_strings_sizes.value();
-    CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value()));
+    CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(offset_type), stream.value()));
     thrust::inclusive_scan(rmm::exec_policy(stream),
                            string_sizes.template begin(),
                            string_sizes.template end(),
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index d7cc72fdfff..825f09c66e6 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -130,9 +130,9 @@ const character_flags_table_type* get_character_flags_table()
 {
   return d_character_codepoint_flags.find_or_initialize([&](void) {
     character_flags_table_type* table = nullptr;
-    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
+    CUDA_TRY(cudaMemcpyToSymbol(
       character_codepoint_flags, g_character_codepoint_flags, sizeof(g_character_codepoint_flags)));
-    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_codepoint_flags));
+    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_codepoint_flags));
     return table;
   });
 }
@@ -144,9 +144,9 @@ const character_cases_table_type* get_character_cases_table()
 {
   return d_character_cases_table.find_or_initialize([&](void) {
     character_cases_table_type* table = nullptr;
-    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
+    CUDA_TRY(cudaMemcpyToSymbol(
       character_cases_table, g_character_cases_table, sizeof(g_character_cases_table)));
-    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_cases_table));
+    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_cases_table));
     return table;
   });
 }
@@ -158,9 +158,9 @@ const special_case_mapping* get_special_case_mapping_table()
 {
   return d_special_case_mappings.find_or_initialize([&](void) {
     special_case_mapping* table = nullptr;
-    CUDF_CUDA_TRY(cudaMemcpyToSymbol(
+    CUDA_TRY(cudaMemcpyToSymbol(
       character_special_case_mappings, g_special_case_mappings, sizeof(g_special_case_mappings)));
-    CUDF_CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_special_case_mappings));
+    CUDA_TRY(cudaGetSymbolAddress((void**)&table, character_special_case_mappings));
     return table;
   });
 }
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index b69d735f612..6ec364cc048 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -231,7 +231,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con
   cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2;
   rmm::device_uvector offsets(n_upper, stream);
   auto d_offsets = offsets.data();
-  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
+  CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator(0),
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index 00094f2de71..9ab769f9edd 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -52,12 +52,12 @@ rmm::device_uvector get_codepoint_metadata(rmm::cuda_st
                    table + cp_section1_end,
                    table + codepoint_metadata_size,
                    codepoint_metadata_default_value);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(table,
-                                codepoint_metadata,
-                                cp_section1_end * sizeof(codepoint_metadata[0]),  // 1st section
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(table,
+                           codepoint_metadata,
+                           cp_section1_end * sizeof(codepoint_metadata[0]),  // 1st section
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(
     table + cp_section2_begin,
     cp_metadata_917505_917999,
     (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]),  // 2nd section
@@ -80,24 +80,24 @@ rmm::device_uvector get_aux_codepoint_data(rmm::cuda_st
                    table + aux_section1_end,
                    table + aux_codepoint_data_size,
                    aux_codepoint_default_value);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(table,
-                                aux_codepoint_data,
-                                aux_section1_end * sizeof(aux_codepoint_data[0]),  // 1st section
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(table,
+                           aux_codepoint_data,
+                           aux_section1_end * sizeof(aux_codepoint_data[0]),  // 1st section
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(
     table + aux_section2_begin,
    aux_cp_data_44032_55203,
     (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]),  // 2nd section
     cudaMemcpyHostToDevice,
     stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     table + aux_section3_begin,
     aux_cp_data_70475_71099,
     (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]),  // 3rd section
     cudaMemcpyHostToDevice,
     stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
+  CUDA_TRY(cudaMemcpyAsync(
     table + aux_section4_begin,
     aux_cp_data_119134_119232,
     (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]),  // 4th section
@@ -236,33 +236,33 @@ std::unique_ptr load_vocabulary_file(
                                             cudf::mask_state::UNALLOCATED,
                                             stream,
                                             mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data(),
-                                table.data(),
-                                table.size() * sizeof(uint64_t),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data(),
+                           table.data(),
+                           table.size() * sizeof(uint64_t),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   result.bin_coefficients = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64},
                                                       bin_coefficients.size(),
                                                       cudf::mask_state::UNALLOCATED,
                                                       stream,
                                                       mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data(),
-                                bin_coefficients.data(),
-                                bin_coefficients.size() * sizeof(uint64_t),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data(),
+                           bin_coefficients.data(),
+                           bin_coefficients.size() * sizeof(uint64_t),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16},
                                                  bin_offsets.size(),
                                                  cudf::mask_state::UNALLOCATED,
                                                  stream,
                                                  mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data(),
-                                bin_offsets.data(),
-                                bin_offsets.size() * sizeof(uint16_t),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
+  CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data(),
+                           bin_offsets.data(),
+                           bin_offsets.size() * sizeof(uint16_t),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
 
   auto cp_metadata            = detail::get_codepoint_metadata(stream);
   auto const cp_metadata_size = static_cast(cp_metadata.size());
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index 7d8df583039..82bb50c6aaa 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -457,7 +457,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                                  num_code_points,
                                                                  device_token_ids.data(),
                                                                  device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   cudf::detail::grid_1d const grid_mark{static_cast(num_strings + 1),
                                        THREADS_PER_BLOCK};
@@ -469,7 +469,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                            device_start_word_indices,
                                                            device_end_word_indices,
                                                            num_strings);
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   // check for special tokens and adjust indices
   thrust::for_each_n(
@@ -512,7 +512,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
                                                        num_words,
                                                        device_token_ids.data(),
                                                        device_tokens_per_word.data());
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   // Repurpose the input array for the token ids. In the worst case, each code point ends up being a
   // token so this will always have enough memory to store the contiguous tokens.
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 74433af9f05..bc3678380be 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -102,9 +102,9 @@ std::unique_ptr compute_column(table_view const& table,
   // Configure kernel parameters
   auto const& device_expression_data = parser.device_expression_data;
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   auto constexpr MAX_BLOCK_SIZE = 128;
   auto const block_size =
@@ -125,7 +125,7 @@ std::unique_ptr compute_column(table_view const& table,
       <<>>(
         *table_device, device_expression_data, *mutable_output_device);
   }
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return output_column;
 }
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 744cec90fd9..0f06be0149e 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -505,9 +505,9 @@ std::unique_ptr row_bit_count(table_view const& t,
   // of memory of size (# input rows * sizeof(row_span) * max_branch_depth).
   auto const shmem_per_thread = sizeof(row_span) * h_info.max_branch_depth;
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   constexpr int max_block_size = 256;
   auto const block_size =
diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh
index 08b68cc0591..19d78b010ec 100644
--- a/cpp/src/unary/unary_ops.cuh
+++ b/cpp/src/unary/unary_ops.cuh
@@ -70,7 +70,7 @@ struct launcher {
     thrust::transform(
       rmm::exec_policy(stream), input.begin(), input.end(), output_view.begin(), F{});
 
-    CUDF_CHECK_CUDA(stream.value());
+    CHECK_CUDA(stream.value());
 
     return output;
   }
diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp
index 6c2c0716331..3a479f0860b 100644
--- a/cpp/tests/bitmask/bitmask_tests.cpp
+++ b/cpp/tests/bitmask/bitmask_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
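
The compute_column and row_bit_count hunks above share a launch-configuration idiom: query the device's shared-memory-per-block limit, then size thread blocks so the kernel's dynamic shared-memory request fits. A minimal self-contained sketch of that idiom, not cudf's exact code; the function name and the halving strategy are illustrative only:

#include <cuda_runtime.h>
#include <cstddef>

// Sketch: shrink the block size until the per-block dynamic shared-memory
// request fits under the device limit.
int pick_block_size(std::size_t shmem_per_thread, int max_block_size = 256)
{
  int device_id = 0;
  cudaGetDevice(&device_id);
  int shmem_limit_per_block = 0;
  cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id);
  int block_size = max_block_size;
  while (block_size > 32 &&
         shmem_per_thread * block_size > static_cast<std::size_t>(shmem_limit_per_block)) {
    block_size /= 2;  // halve until the shared-memory request fits
  }
  return block_size;
}
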
@@ -88,10 +88,10 @@ rmm::device_uvector make_mask(cudf::size_type size, bool fil
     return cudf::detail::make_zeroed_device_uvector_sync(size);
   } else {
     auto ret = rmm::device_uvector(size, rmm::cuda_stream_default);
-    CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(),
-                                  ~cudf::bitmask_type{0},
-                                  size * sizeof(cudf::bitmask_type),
-                                  rmm::cuda_stream_default.value()));
+    CUDA_TRY(cudaMemsetAsync(ret.data(),
+                             ~cudf::bitmask_type{0},
+                             size * sizeof(cudf::bitmask_type),
+                             rmm::cuda_stream_default.value()));
     return ret;
   }
 }
@@ -530,10 +530,10 @@ void cleanEndWord(rmm::device_buffer& mask, int begin_bit, int end_bit)
   auto number_of_bits = end_bit - begin_bit;
   if (number_of_bits % 32 != 0) {
     cudf::bitmask_type end_mask = 0;
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       &end_mask, ptr + number_of_mask_words - 1, sizeof(end_mask), cudaMemcpyDeviceToHost));
     end_mask = end_mask & ((1 << (number_of_bits % 32)) - 1);
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       ptr + number_of_mask_words - 1, &end_mask, sizeof(end_mask), cudaMemcpyHostToDevice));
   }
 }
diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu
index 4d76008fd13..93e4e588e0e 100644
--- a/cpp/tests/copying/concatenate_tests.cu
+++ b/cpp/tests/copying/concatenate_tests.cu
@@ -62,9 +62,9 @@ struct TypedColumnTest : public cudf::test::BaseFixture {
     std::iota(h_data.begin(), h_data.end(), char{0});
     std::vector h_mask(mask.size());
     std::iota(h_mask.begin(), h_mask.end(), char{0});
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       typed_data, h_data.data(), data.size(), cudaMemcpyHostToDevice, stream.value()));
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
+    CUDA_TRY(cudaMemcpyAsync(
       typed_mask, h_mask.data(), mask.size(), cudaMemcpyHostToDevice, stream.value()));
     stream.synchronize();
   }
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index 1067366d010..581268f26f4 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -155,7 +155,7 @@ struct AtomicsTest : public cudf::test::BaseFixture {
 
     auto host_result = cudf::detail::make_host_vector_sync(dev_result);
-    CUDF_CHECK_CUDA(rmm::cuda_stream_default.value());
+    CHECK_CUDA(rmm::cuda_stream_default.value());
 
     if (!is_timestamp_sum()) {
       EXPECT_EQ(host_result[0], exact[0]) << "atomicAdd test failed";
@@ -302,7 +302,7 @@ struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture {
 
     auto host_result = cudf::detail::make_host_vector_sync(dev_result);
-    CUDF_CHECK_CUDA(rmm::cuda_stream_default.value());
+    CHECK_CUDA(rmm::cuda_stream_default.value());
 
     // print_exact(exact, "exact");
     // print_exact(host_result.data(), "result");
diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
index 4327a8b694b..da9509e94a6 100644
--- a/cpp/tests/error/error_handling_test.cu
+++ b/cpp/tests/error/error_handling_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
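
The error_handling_test.cu hunks that follow exercise the CUDA_TRY and CHECK_CUDA macros this whole diff renames. As a reading aid only, here is a minimal sketch of how such an error-checking macro is commonly written; it is not the library's actual definition (the name and exception type below are stand-ins), but it matches the failure messages the tests assert on, an error name followed by the cudaGetErrorString text:

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Stand-in for the library's CUDA exception type.
struct cuda_error_sketch : std::runtime_error {
  using std::runtime_error::runtime_error;
};

// Wrap a CUDA runtime call and throw on any non-success status.
#define CUDA_TRY_SKETCH(call)                                               \
  do {                                                                      \
    cudaError_t const status = (call);                                      \
    if (status != cudaSuccess) {                                            \
      throw cuda_error_sketch(std::string{cudaGetErrorName(status)} + " " + \
                              cudaGetErrorString(status));                  \
    }                                                                       \
  } while (0)
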
@@ -36,28 +36,28 @@ TEST(ExpectsTest, TryCatch)
 
 TEST(CudaTryTest, Error)
 {
-  CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure),
+  CUDA_EXPECT_THROW_MESSAGE(CUDA_TRY(cudaErrorLaunchFailure),
                             "cudaErrorLaunchFailure unspecified launch failure");
 }
 
-TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); }
+TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDA_TRY(cudaSuccess)); }
 
 TEST(CudaTryTest, TryCatch)
 {
-  CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation),
+  CUDA_EXPECT_THROW_MESSAGE(CUDA_TRY(cudaErrorMemoryAllocation),
                             "cudaErrorMemoryAllocation out of memory");
 }
 
-TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); }
+TEST(StreamCheck, success) { EXPECT_NO_THROW(CHECK_CUDA(0)); }
 
 namespace {
 // Some silly kernel that will cause an error
 void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }
 }  // namespace
 
-// In a release build and without explicit synchronization, CUDF_CHECK_CUDA may
+// In a release build and without explicit synchronization, CHECK_CUDA may
 // or may not fail on erroneous asynchronous CUDA calls. Invoke
 // cudaStreamSynchronize to guarantee failure on error. In a non-release build,
-// CUDF_CHECK_CUDA deterministically fails on erroneous asynchronous CUDA
+// CHECK_CUDA deterministically fails on erroneous asynchronous CUDA
 // calls.
 TEST(StreamCheck, FailedKernel)
 {
@@ -67,7 +67,7 @@ TEST(StreamCheck, FailedKernel)
 #ifdef NDEBUG
   stream.synchronize();
 #endif
-  EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error);
+  EXPECT_THROW(CHECK_CUDA(stream.value()), cudf::cuda_error);
 }
 
 TEST(StreamCheck, CatchFailedKernel)
@@ -78,7 +78,7 @@ TEST(StreamCheck, CatchFailedKernel)
 #ifndef NDEBUG
   stream.synchronize();
 #endif
-  CUDA_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()),
+  CUDA_EXPECT_THROW_MESSAGE(CHECK_CUDA(stream.value()),
                             "cudaErrorInvalidConfiguration "
                             "invalid configuration argument");
 }
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index cd0aab3caeb..1f4a8a7e508 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -1101,11 +1101,11 @@ class custom_test_data_sink : public cudf::io::data_sink {
   {
     return std::async(std::launch::deferred, [=] {
       char* ptr = nullptr;
-      CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
-      CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
+      CUDA_TRY(cudaMallocHost(&ptr, size));
+      CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
       stream.synchronize();
       outfile_.write(ptr, size);
-      CUDF_CUDA_TRY(cudaFreeHost(ptr));
+      CUDA_TRY(cudaFreeHost(ptr));
     });
   }
@@ -2166,11 +2166,11 @@ class custom_test_memmap_sink : public cudf::io::data_sink {
   {
     return std::async(std::launch::deferred, [=] {
       char* ptr = nullptr;
-      CUDF_CUDA_TRY(cudaMallocHost(&ptr, size));
-      CUDF_CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
+      CUDA_TRY(cudaMallocHost(&ptr, size));
+      CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value()));
       stream.synchronize();
       mm_writer->host_write(ptr, size);
-      CUDF_CUDA_TRY(cudaFreeHost(ptr));
+      CUDA_TRY(cudaFreeHost(ptr));
     });
   }
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index f560ce7f20c..57041e448a2 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -1423,7 +1423,7 @@ TEST_F(JoinTest, HashJoinLargeOutputSize)
   // self-join a table of zeroes to generate an output row count that would overflow int32_t
   std::size_t col_size = 65567;
   rmm::device_buffer zeroes(col_size * sizeof(int32_t), rmm::cuda_stream_default);
-  CUDF_CUDA_TRY(cudaMemsetAsync(zeroes.data(), 0, zeroes.size(), rmm::cuda_stream_default.value()));
+  CUDA_TRY(cudaMemsetAsync(zeroes.data(), 0, zeroes.size(), rmm::cuda_stream_default.value()));
   cudf::column_view col_zeros(cudf::data_type{cudf::type_id::INT32}, col_size, zeroes.data());
   cudf::table_view tview{{col_zeros}};
   cudf::hash_join hash_join(tview, cudf::null_equality::UNEQUAL);
diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu
index 30c843a91c4..ee4c878726f 100644
--- a/cpp/tests/scalar/scalar_device_view_test.cu
+++ b/cpp/tests/scalar/scalar_device_view_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,13 +60,13 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)
   rmm::device_scalar result{rmm::cuda_stream_default};
 
   test_set_value<<<1, 1>>>(scalar_device_view, scalar_device_view1);
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_EQ(s1.value(), value);
   EXPECT_TRUE(s1.is_valid());
 
   test_value<<<1, 1>>>(scalar_device_view, scalar_device_view1, result.data());
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
@@ -85,7 +85,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
   rmm::device_scalar result{rmm::cuda_stream_default};
 
   test_null<<<1, 1>>>(scalar_device_view, result.data());
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_FALSE(result.value(rmm::cuda_stream_default));
 }
@@ -105,7 +105,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, SetNull)
   EXPECT_TRUE(s.is_valid());
 
   test_setnull<<<1, 1>>>(scalar_device_view);
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_FALSE(s.is_valid());
 }
@@ -131,7 +131,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
   auto value_v = cudf::detail::make_device_uvector_sync(value);
 
   test_string_value<<<1, 1>>>(scalar_device_view, value_v.data(), value.size(), result.data());
-  CUDF_CHECK_CUDA(0);
+  CHECK_CUDA(0);
 
   EXPECT_TRUE(result.value(rmm::cuda_stream_default));
 }
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp
index 1dd7e21b821..f79c77ce429 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -86,7 +86,7 @@ TYPED_TEST(Sort, WithNullMax)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
@@ -124,7 +124,7 @@ TYPED_TEST(Sort, WithNullMin)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
@@ -160,7 +160,7 @@ TYPED_TEST(Sort, WithMixedNullOrder)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp
index ee43c9e7b4b..b6b7495136e 100644
--- a/cpp/tests/sort/stable_sort_tests.cpp
+++ b/cpp/tests/sort/stable_sort_tests.cpp
@@ -94,7 +94,7 @@ TYPED_TEST(StableSort, WithNullMax)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
@@ -130,7 +130,7 @@ TYPED_TEST(StableSort, WithNullMin)
   // the rest of the values are equivalent and yields random sorted order.
   auto to_host = [](column_view const& col) {
     thrust::host_vector h_data(col.size());
-    CUDF_CUDA_TRY(cudaMemcpy(
+    CUDA_TRY(cudaMemcpy(
       h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault));
     return h_data;
   };
diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu
index 6861737bfb5..0ba4b268c70 100644
--- a/cpp/tests/strings/factories_test.cu
+++ b/cpp/tests/strings/factories_test.cu
@@ -78,7 +78,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair)
     h_offsets[idx + 1] = offset;
   }
   auto d_strings = cudf::detail::make_device_uvector_sync(strings);
-  CUDF_CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice));
+  CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice));
   auto column = cudf::make_strings_column(d_strings);
   EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING});
   EXPECT_EQ(column->null_count(), nulls);
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index 5802a1ddc0a..7f8a31ef9bb 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -302,10 +302,10 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger)
   auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()},
                                             (cudf::size_type)d_integers.size());
   auto integers_view = integers->mutable_view();
-  CUDF_CUDA_TRY(cudaMemcpy(integers_view.data(),
-                           d_integers.data(),
-                           d_integers.size() * sizeof(TypeParam),
-                           cudaMemcpyDeviceToDevice));
+  CUDA_TRY(cudaMemcpy(integers_view.data(),
+                      d_integers.data(),
+                      d_integers.size() * sizeof(TypeParam),
+                      cudaMemcpyDeviceToDevice));
   integers_view.set_null_count(0);
 
   // convert to strings
diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu
index d8b2a736bde..dca80b597c0 100644
--- a/cpp/tests/types/type_dispatcher_test.cu
+++ b/cpp/tests/types/type_dispatcher_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
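
The to_host lambdas in the sort-test hunks above appear to have lost their template arguments during extraction (thrust::host_vector and col.data() are shown without type parameters). A restored sketch of that helper, assuming int32_t as implied by the adjacent sizeof(int32_t):

#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <cudf/column/column_view.hpp>

// Restored sketch of the test helper; int32_t is an assumption.
auto to_host = [](cudf::column_view const& col) {
  thrust::host_vector<int32_t> h_data(col.size());
  cudaMemcpy(
    h_data.data(), col.data<int32_t>(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault);
  return h_data;
};
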
@@ -71,7 +71,7 @@ TYPED_TEST(TypedDispatcherTest, DeviceDispatch)
 {
   auto result = cudf::detail::make_zeroed_device_uvector_sync(1);
   dispatch_test_kernel<<<1, 1>>>(cudf::type_to_id(), result.data());
-  CUDF_CUDA_TRY(cudaDeviceSynchronize());
+  CUDA_TRY(cudaDeviceSynchronize());
   EXPECT_EQ(true, result.front_element(rmm::cuda_stream_default));
 }
 
@@ -132,7 +132,7 @@ TYPED_TEST(TypedDoubleDispatcherTest, DeviceDoubleDispatch)
   auto result = cudf::detail::make_zeroed_device_uvector_sync(1);
   double_dispatch_test_kernel<<<1, 1>>>(
     cudf::type_to_id(), cudf::type_to_id(), result.data());
-  CUDF_CUDA_TRY(cudaDeviceSynchronize());
+  CUDA_TRY(cudaDeviceSynchronize());
   EXPECT_EQ(true, result.front_element(rmm::cuda_stream_default));
 }
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 015178f8c7c..68626c2d4d3 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -824,16 +824,16 @@ std::vector bitmask_to_host(cudf::column_view const& c)
     auto num_bitmasks = num_bitmask_words(c.size());
     std::vector host_bitmask(num_bitmasks);
     if (c.offset() == 0) {
-      CUDF_CUDA_TRY(cudaMemcpy(host_bitmask.data(),
-                               c.null_mask(),
-                               num_bitmasks * sizeof(bitmask_type),
-                               cudaMemcpyDeviceToHost));
+      CUDA_TRY(cudaMemcpy(host_bitmask.data(),
+                          c.null_mask(),
+                          num_bitmasks * sizeof(bitmask_type),
+                          cudaMemcpyDeviceToHost));
     } else {
       auto mask = copy_bitmask(c.null_mask(), c.offset(), c.offset() + c.size());
-      CUDF_CUDA_TRY(cudaMemcpy(host_bitmask.data(),
-                               mask.data(),
-                               num_bitmasks * sizeof(bitmask_type),
-                               cudaMemcpyDeviceToHost));
+      CUDA_TRY(cudaMemcpy(host_bitmask.data(),
+                          mask.data(),
+                          num_bitmasks * sizeof(bitmask_type),
+                          cudaMemcpyDeviceToHost));
     }
 
     return host_bitmask;
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index cebe476dd87..78ac8a18107 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -137,8 +137,8 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
           left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
       char *copy_to = current_buffer_data + current_buffer_written;
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost,
-                                    stream.value()));
+      CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost,
+                               stream.value()));
 
       copy_from = copy_from + amount_to_copy;
       current_buffer_written += amount_to_copy;
diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu
index 13d1a5a94a9..683651799e7 100644
--- a/java/src/main/native/src/map_lookup.cu
+++ b/java/src/main/native/src/map_lookup.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
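
The jni_writer_data_sink hunk above copies device output into a bounded host buffer one piece at a time. A minimal standalone sketch of that chunked device-to-host pattern, with a hypothetical helper name; the real sink additionally hands each filled chunk to a Java-side buffer before reusing it:

#include <cuda_runtime.h>
#include <algorithm>
#include <cstddef>

// Hypothetical helper illustrating the pattern, not the JNI sink itself.
void copy_device_to_host_chunked(char* dst_host, char const* src_dev, std::size_t total_bytes,
                                 std::size_t chunk_bytes, cudaStream_t stream)
{
  for (std::size_t offset = 0; offset < total_bytes; offset += chunk_bytes) {
    std::size_t const this_chunk = std::min(chunk_bytes, total_bytes - offset);
    cudaMemcpyAsync(dst_host + offset, src_dev + offset, this_chunk, cudaMemcpyDeviceToHost,
                    stream);
    cudaStreamSynchronize(stream);  // the chunk is safe to consume on the host after this
  }
}
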
@@ -123,7 +123,7 @@ get_gather_map_for_map_values(column_view const &input, string_scalar &lookup_ke
   gpu_find_first<<>>(
       *input_device_view, *output_view, lookup_key_device_view);
 
-  CUDF_CHECK_CUDA(stream.value());
+  CHECK_CUDA(stream.value());
 
   return gather_map;
 }
diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu
index 96ee95c476d..4d78f416134 100644
--- a/java/src/main/native/src/row_conversion.cu
+++ b/java/src/main/native/src/row_conversion.cu
@@ -1766,9 +1766,9 @@ std::vector> convert_to_rows(
     std::optional> variable_width_offsets,
     rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) {
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int total_shmem_in_bytes;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
       cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
 
 #ifndef __CUDA_ARCH__ // __host__ code.
@@ -2097,9 +2097,9 @@ std::unique_ptr convert_from_rows(lists_column_view const &input,
   auto const num_rows = input.parent().size();
 
   int device_id;
-  CUDF_CUDA_TRY(cudaGetDevice(&device_id));
+  CUDA_TRY(cudaGetDevice(&device_id));
   int total_shmem_in_bytes;
-  CUDF_CUDA_TRY(
+  CUDA_TRY(
      cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
 
 #ifndef __CUDA_ARCH__ // __host__ code.
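
Several call sites in this diff, for example find_all_from_set in parsing_utils.cu, pick launch parameters with cudaOccupancyMaxPotentialBlockSize rather than hard-coding them. A small self-contained sketch of that pattern, using a stand-in kernel (the name and empty body below are illustrative, not the library's kernel):

#include <cuda_runtime.h>
#include <cstddef>

// Stand-in for a real worker kernel such as count_and_set_positions.
__global__ void scan_positions_sketch(char const* data, std::size_t n)
{
  (void)data;  // a real kernel would process data[0..n)
  (void)n;
}

void launch_with_occupancy(char const* d_data, std::size_t n)
{
  int block_size    = 0;  // suggested thread count per block
  int min_grid_size = 0;  // minimum block count needed to saturate the device
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, scan_positions_sketch);
  int const grid_size = static_cast<int>((n + block_size - 1) / block_size);
  scan_positions_sketch<<<grid_size, block_size>>>(d_data, n);
}
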