diff --git a/build.sh b/build.sh index 45074a6645f..c2eba134c35 100755 --- a/build.sh +++ b/build.sh @@ -230,6 +230,7 @@ if buildAll || hasArg libcudf; then fi echo "$MSG" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f23296038f2..6f19f174da0 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,6 +85,7 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then gpuci_logger "Copying build metrics results" mkdir -p "$WORKSPACE/build-metrics" cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + cp "$LIBCUDF_BUILD_DIR/ninja.log" "$WORKSPACE/build-metrics/ninja.log" fi gpuci_logger "Build conda pkg for libcudf_kafka" diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 57592de59af..13ef02efc99 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) +ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index bb4a0ce72d8..e15513275ee 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -88,13 +88,13 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const use_np_dtypes = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))}; + // skip_rows is not supported on nested types auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING), - int32_t(cudf::type_id::LIST)}), + int32_t(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -181,11 +181,12 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); +// Need an API to get the number of stripes to enable row_selection::STRIPES here BENCHMARK_DEFINE_F(OrcRead, row_selection) (::benchmark::State& state) { BM_orc_read_varying_options(state); } BENCHMARK_REGISTER_F(OrcRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 8}, {0b11},
// defaults {int32_t(cudf::type_id::EMPTY)}}) diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index d7a3a668bd1..09194931498 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -88,13 +88,13 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const use_pandas_metadata = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))}; + // No nested types here, because of https://github.com/rapidsai/cudf/issues/9970 auto const data_types = dtypes_for_column_selection( get_type_or_group({static_cast<int32_t>(type_group_id::INTEGRAL), static_cast<int32_t>(type_group_id::FLOATING_POINT), static_cast<int32_t>(type_group_id::FIXED_POINT), static_cast<int32_t>(type_group_id::TIMESTAMP), - static_cast<int32_t>(cudf::type_id::STRING), - static_cast<int32_t>(cudf::type_id::LIST)}), + static_cast<int32_t>(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); @@ -181,20 +181,18 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -// Disabled until we add an API to read metadata from a parquet file and determine num row groups. -// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 -/* +// row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file and +// determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 BENCHMARK_DEFINE_F(ParquetRead, row_selection) (::benchmark::State& state) { BM_parq_read_varying_options(state); } BENCHMARK_REGISTER_F(ParquetRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::ROW_GROUPS), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 4}, {0b01}, // defaults {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -*/ BENCHMARK_DEFINE_F(ParquetRead, misc_options) (::benchmark::State& state) { BM_parq_read_varying_options(state); } diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp index 8039d7d065f..317db92ae8b 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -15,64 +15,102 @@ */ #include -#include +#include #include #include #include -#include -#include + +#include + +#include #include #include -class Compaction : public cudf::benchmark { -}; +// necessary for custom enum types +// see: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + cudf::duplicate_keep_option, + // Callable to generate input strings: + [](cudf::duplicate_keep_option option) { + switch (option) { + case cudf::duplicate_keep_option::KEEP_FIRST: return "KEEP_FIRST"; + case cudf::duplicate_keep_option::KEEP_LAST: return "KEEP_LAST"; + case cudf::duplicate_keep_option::KEEP_NONE: return "KEEP_NONE"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template <typename Type, cudf::duplicate_keep_option Keep> +void nvbench_drop_duplicates(nvbench::state& state, + nvbench::type_list<Type, nvbench::enum_type<Keep>>) +{ + if constexpr (not std::is_same_v<Type, int32_t> and + Keep != cudf::duplicate_keep_option::KEEP_FIRST) { + state.skip("Skip unwanted benchmarks."); + } + + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator<long> rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? false : true; }); + cudf::test::fixed_width_column_wrapper<Type> values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::drop_duplicates( + input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); + }); +}
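Each NVBENCH_BENCH_TYPES registration at the end of this file expands into the cross product of its axes, so drop_duplicates is measured once per (Type, KeepOption, NumRows) combination; the if-constexpr guard above skips the combinations the old Google Benchmark macros never instantiated (only int32_t exercised KEEP_LAST and KEEP_NONE). Assuming nvbench's standard command line, a single combination of the resulting STREAM_COMPACTION_BENCH binary can then be selected with something like `./STREAM_COMPACTION_BENCH -b drop_duplicates -a "NumRows=1000000"` (the flag spellings here are illustrative, not taken from this patch).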
 template <typename Type> -void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) +void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list<Type>) { - auto const n_rows = static_cast<cudf::size_type>(state.range(0)); + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); cudf::test::UniformRandomGenerator<long> rand_gen(0, 100); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](auto row) { return rand_gen.generate(); }); auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 100 == 0 ? false : true; }); - cudf::test::fixed_width_column_wrapper<Type> values(elements, elements + n_rows, valids); + cudf::test::fixed_width_column_wrapper<Type> values(elements, elements + num_rows, valids); auto input_column = cudf::column_view(values); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::drop_duplicates(input_table, {0}, keep); - } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::unordered_drop_duplicates( + input_table, {0}, cudf::null_equality::EQUAL, stream_view); + }); } -#define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op - -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, keep) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ - { \ - BM_compaction(state, get_keep(keep)); \ - } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ -#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ - RBM_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) -COMPACTION_BENCHMARK_DEFINE(bool, NONE); -COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); -COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); -using cudf::timestamp_ms; -COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); -COMPACTION_BENCHMARK_DEFINE(float, NONE); +using data_type = nvbench::type_list<bool, int8_t, int32_t, cudf::timestamp_ms, float>; +using keep_option = nvbench::enum_type_list<cudf::duplicate_keep_option::KEEP_FIRST, + cudf::duplicate_keep_option::KEEP_LAST, + cudf::duplicate_keep_option::KEEP_NONE>; + +NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("drop_duplicates") + .set_type_axes_names({"Type", "KeepOption"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type)) + .set_name("unordered_drop_duplicates") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index c964c85156c..5a20f78b798 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 87823d71c6f..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,6 +75,18 @@ std::unique_ptr<table> drop_duplicates( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::unordered_drop_duplicates + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr<table>
unordered_drop_duplicates( + table_view const& input, + std::vector<size_type> const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ @@ -94,5 +106,24 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::unordered_distinct_count(table_view const&, null_equality) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 6a85428d8f0..a7112ae415d 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -387,9 +387,9 @@ class fixed_point { /** * @brief operator + (for adding two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are added - * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are added + * If `_scale`s are equal, `_value`s are added. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are added. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this`
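A worked example of the rescaling rule these doc fixes now describe (values are illustrative, base-10 fixed_point):

    // a = {_value = 105, _scale = -2}   // represents 1.05
    // b = {_value =  21, _scale = -1}   // represents 2.1
    // operator+ shifts b, the number with the larger _scale, down to -2 (21 -> 210)
    // and then adds the _values: 105 + 210 = 315
    // a + b == {_value = 315, _scale = -2}   // represents 3.15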
@@ -402,9 +402,9 @@ class fixed_point { /** * @brief operator - (for subtracting two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are subtracted
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are subtracted + * If `_scale`s are equal, `_value`s are subtracted. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are subtracted. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -417,7 +417,7 @@ class fixed_point { /** * @brief operator * (for multiplying two `fixed_point` numbers) * - * `_scale`s are added and `_value`s are multiplied + * `_scale`s are added and `_value`s are multiplied. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -430,7 +430,7 @@ class fixed_point { /** * @brief operator / (for dividing two `fixed_point` numbers) * - * `_scale`s are subtracted and `_value`s are divided + * `_scale`s are subtracted and `_value`s are divided. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -443,9 +443,9 @@ class fixed_point { /** * @brief operator == (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -458,9 +458,9 @@ class fixed_point { /** * @brief operator != (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -473,9 +473,9 @@ class fixed_point { /** * @brief operator <= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -488,9 +488,9 @@ class fixed_point { /** * @brief operator >= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -503,9 +503,9 @@ class fixed_point { /** * @brief operator < (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -518,9 +518,9 @@ class fixed_point { /** * @brief operator > (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -534,7 +534,7 @@ class fixed_point { * @brief Method for creating a `fixed_point` number with a new `scale` * * The `fixed_point` number returned will have the same value, underlying representation and - * radix as `this`, the only thing changed is the scale + * radix as `this`, the only thing changed is the scale. * * @param scale The `scale` of the returned `fixed_point` number * @return `fixed_point` number with a new `scale` diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 7551511d281..94039d81f31 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -189,7 +189,7 @@ std::unique_ptr<table>
drop_nans( * @note if @p input.num_rows() is zero, there is no error, and an empty table * is returned. * - * @throws cudf::logic_error if The `input` size and `boolean_mask` size mismatches. + * @throws cudf::logic_error if `input.num_rows() != boolean_mask.size()`. * @throws cudf::logic_error if `boolean_mask` is not `type_id::BOOL8` type. * * @param[in] input The input table_view to filter @@ -214,7 +214,10 @@ enum class duplicate_keep_option { }; /** - * @brief Create a new table without duplicate rows + * @brief Create a new table without duplicate rows. + * + * The output table is sorted according to the lexicographic ordering of the data in the columns + * indexed by `keys`. * * Given an `input` table_view, each row is copied to output table if the corresponding * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * - * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] keep keep first row, last row, or no rows of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL * @param[in] null_precedence flag to denote nulls should appear before or after non-null items * @param[in] mr Device memory resource used to allocate the returned table's device - * memory + * memory * - * @return Table with unique rows as per specified `keep`. + * @return Table with sorted unique rows as specified by `keep`. */ std::unique_ptr<table> drop_duplicates( table_view const& input, @@ -244,37 +247,95 @@ std::unique_ptr<table> drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Count the unique elements in the column_view + * @brief Create a new table without duplicate rows with hash-based algorithms. + * + * Given an `input` table_view, each row is copied to output table if the corresponding + * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which + * row is copied. * - * Given an input column_view, number of unique elements in this column_view is returned + * The order of elements in the output table is not specified. + * + * @param[in] input input table_view to copy only unique rows + * @param[in] keys vector of indices representing key columns from `input` + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not + * equal if null_equality::UNEQUAL + * @param[in] mr Device memory resource used to allocate the returned table's device + * memory + * + * @return Table with unique rows in an unspecified order. + */ +std::unique_ptr<table> unordered_drop_duplicates( + table_view const& input, + std::vector<size_type> const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
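A quick contrast of the two entry points declared above (sketch only, with a single key column at index 0):

    // input keys (column 0): [3, 1, 3, 2]
    auto sorted_unique = cudf::drop_duplicates(
      input, {0}, cudf::duplicate_keep_option::KEEP_FIRST);
    // -> key column [1, 2, 3]: duplicates removed and rows sorted by key

    auto unsorted_unique = cudf::unordered_drop_duplicates(input, {0});
    // -> keys {1, 2, 3}, in an unspecified order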
 + +/** + * @brief Count the number of consecutive groups of equivalent elements in a column. * * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and - * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique - * count. + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count. + * + * `null`s are handled as equal. * - * @param[in] input The column_view whose unique elements will be counted. + * @param[in] input The column_view whose number of distinct consecutive groups will be counted * @param[in] null_handling flag to include or ignore `null` while counting - * @param[in] nan_handling flag to consider `NaN==null` or not. + * @param[in] nan_handling flag to consider `NaN==null` or not * - * @return number of unique elements + * @return number of distinct consecutive groups in the column */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling); /** - * @brief Count the unique rows in a table. - * + * @brief Count the number of consecutive groups of equivalent elements in a table. * - * @param[in] input Table whose unique rows will be counted. - * @param[in] nulls_equal flag to denote if null elements should be considered equal - * nulls are not equal if null_equality::UNEQUAL + * @param[in] input Table whose number of distinct consecutive groups will be counted + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL. * - * @return number of unique rows in the table + * @return number of distinct consecutive groups in the table */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); +/** + * @brief Count the unique elements in the column_view. + * + * If `nulls_equal == null_equality::UNEQUAL`, all `null`s are unique. + * + * Given an input column_view, number of unique elements in this column_view is returned. + * + * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both + * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique + * count. + * + * `null`s are handled as equal. + * + * @param[in] input The column_view whose unique elements will be counted + * @param[in] null_handling flag to include or ignore `null` while counting + * @param[in] nan_handling flag to consider `NaN==null` or not + * + * @return number of unique elements + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); + +/** + * @brief Count the unique rows in a table. + * + * @param[in] input Table whose unique rows will be counted + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL.
+ * + * @return number of unique rows in the table + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL); + /** @} */ } // namespace cudf diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index bac6697da82..33c369b254f 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # import argparse import os @@ -34,49 +34,63 @@ # build a map of the log entries entries = {} with open(log_file, "r") as log: + last = 0 + files = {} for line in log: entry = line.split() if len(entry) > 4: - elapsed = int(entry[1]) - int(entry[0]) obj_file = entry[3] file_size = ( os.path.getsize(os.path.join(log_path, obj_file)) if os.path.exists(obj_file) else 0 ) - entries[entry[3]] = (elapsed, file_size) + start = int(entry[0]) + end = int(entry[1]) + # logic based on ninjatracing + if end < last: + files = {} + last = end + files.setdefault(entry[4], (entry[3], start, end, file_size)) -# check file could be loaded + # build entries from files dict + for entry in files.values(): + entries[entry[0]] = (entry[1], entry[2], entry[3]) + +# check file could be loaded and we have entries to report if len(entries) == 0: print("Could not parse", log_file) exit() -# sort the keys by build time (descending order) -keys = list(entries.keys()) -sl = sorted(keys, key=lambda k: entries[k][0], reverse=True) +# sort the entries by build-time (descending order) +sorted_list = sorted( + list(entries.keys()), + key=lambda k: entries[k][1] - entries[k][0], + reverse=True, ) -if output_fmt == "xml": - # output results in XML format +# output results in XML format +def output_xml(entries, sorted_list, args): root = ET.Element("testsuites") testsuite = ET.Element( "testsuite", attrib={ "name": "build-time", - "tests": str(len(keys)), + "tests": str(len(sorted_list)), "failures": str(0), "errors": str(0), }, ) root.append(testsuite) - for key in sl: - entry = entries[key] - elapsed = float(entry[0]) / 1000 + for name in sorted_list: + entry = entries[name] + build_time = float(entry[1] - entry[0]) / 1000 item = ET.Element( "testcase", attrib={ "classname": "BuildTime", - "name": key, - "time": str(elapsed), + "name": name, + "time": str(build_time), }, ) testsuite.append(item) @@ -85,62 +99,219 @@ xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") print(xmlstr) -elif output_fmt == "html": - # output results in HTML format - print("Sorted Ninja Build Times") - # Note: Jenkins does not support style defined in the html + +# utility converts a millisecond value to a column width in pixels +def time_to_width(value, end): + # map a value from (0,end) to (0,1000) + r = (float(value) / float(end)) * 1000.0 + return int(r) + + +# assign each entry to a thread by analyzing the start/end times and +# slotting them into thread buckets where they fit +def assign_entries_to_threads(entries): + # first sort the entries' keys by end timestamp + sorted_keys = sorted( + list(entries.keys()), key=lambda k: entries[k][1], reverse=True + ) + + # build the chart data by assigning entries to threads + results = {} + threads = [] + for name in sorted_keys: + entry = entries[name] + + # assign this entry by finding the first available thread identified + # by the thread's current start time greater than the entry's end time + tid = -1 + for t in range(len(threads)): + if threads[t] >= entry[1]: + threads[t] = entry[0]
+ tid = t + break + + # if no current thread found, create a new one with this entry + if tid < 0: + threads.append(entry[0]) + tid = len(threads) - 1 + + # add entry name to the array associated with this tid + if tid not in results.keys(): + results[tid] = [] + results[tid].append(name) + + # first entry has the last end time + end_time = entries[sorted_keys[0]][1] + + # return the threaded entries and the last end time + return (results, end_time) + + +# output chart results in HTML format +def output_html(entries, sorted_list, args): + print("Build Metrics Report") + # Note: Jenkins does not support javascript nor style defined in the html # https://www.jenkins.io/doc/book/security/configuring-content-security-policy/ print("") if args.msg is not None: print("

", args.msg, "

") - print("
") - print( - "", - "", - "", - sep="", - ) - summary = {"red": 0, "yellow": 0, "green": 0} + + # map entries to threads + # the end_time is used to scale all the entries to a fixed output width + threads, end_time = assign_entries_to_threads(entries) + + # color ranges for build times + summary = {"red": 0, "yellow": 0, "green": 0, "white": 0} red = "bgcolor='#FFBBD0'" yellow = "bgcolor='#FFFF80'" green = "bgcolor='#AAFFBD'" - for key in sl: - result = entries[key] - elapsed = result[0] - color = green - if elapsed > 300000: # 5 minutes - color = red - summary["red"] += 1 - elif elapsed > 120000: # 2 minutes - color = yellow - summary["yellow"] += 1 - else: - summary["green"] += 1 + white = "bgcolor='#FFFFFF'" + + # create the build-time chart + print("
FileCompile time
(ms)
Size
(bytes)
") + for tid in range(len(threads)): + names = threads[tid] + # sort the names for this thread by start time + names = sorted(names, key=lambda k: entries[k][0]) + + # use the last entry's end time as the total row size + # (this is an estimate and does not have to be exact) + last_entry = entries[names[len(names) - 1]] + last_time = time_to_width(last_entry[1], end_time) print( - "", + "") + + # done with the chart + print("
", - key, - "", - result[0], - "", - result[1], - "
", sep="", ) - print("

") + + prev_end = 0 # used for spacing between entries + + # write out each entry for this thread as a column for a single row + for name in names: + entry = entries[name] + start = entry[0] + end = entry[1] + + # this handles minor gaps between end of the + # previous entry and the start of the next + if prev_end > 0 and start > prev_end: + size = time_to_width(start - prev_end, end_time) + print("") + # adjust for the cellspacing + prev_end = end + int(end_time / 500) + + # format the build-time + build_time = end - start + build_time_str = str(build_time) + " ms" + if build_time > 120000: # 2 minutes + minutes = int(build_time / 60000) + seconds = int(((build_time / 60000) - minutes) * 60) + build_time_str = "{:d}:{:02d} min".format(minutes, seconds) + elif build_time > 1000: + build_time_str = "{:.3f} s".format(build_time / 1000) + + # assign color and accumulate legend values + color = white + if build_time > 300000: # 5 minutes + color = red + summary["red"] += 1 + elif build_time > 120000: # 2 minutes + color = yellow + summary["yellow"] += 1 + elif build_time > 1000: # 1 second + color = green + summary["green"] += 1 + else: + summary["white"] += 1 + + # compute the pixel width based on build-time + size = max(time_to_width(build_time, end_time), 2) + # output the column for this entry + print("") + # update the entry with just the computed output info + entries[name] = (build_time_str, color, entry[2]) + + # add a filler column at the end of each row + print("
", end="") + # use a slightly smaller, fixed-width font + print("", end="") + + # add the file-name if it fits, otherwise, truncate the name + file_name = os.path.basename(name) + if len(file_name) + 3 > size / 7: + abbr_size = int(size / 7) - 3 + if abbr_size > 1: + print(file_name[:abbr_size], "...", sep="", end="") + else: + print(file_name, end="") + # done with this entry + print("

") + + # output detail table in build-time descending order + print("") + print( + "", + "", + "", + sep="", + ) + for name in sorted_list: + entry = entries[name] + build_time_str = entry[0] + color = entry[1] + file_size = entry[2] + + # format file size + file_size_str = "" + if file_size > 1000000: + file_size_str = "{:.3f} MB".format(file_size / 1000000) + elif file_size > 1000: + file_size_str = "{:.3f} KB".format(file_size / 1000) + elif file_size > 0: + file_size_str = str(file_size) + " bytes" + + # output entry row + print("", sep="", end="") + print("", sep="", end="") + print("", sep="") + + print("
FileCompile timeSize
", name, "", build_time_str, "", file_size_str, "

") + # include summary table with color legend + print("") print("time > 5 minutes") print("") print("2 minutes < time < 5 minutes") print("") - print("time < 2 minutes") + print("1 second < time < 2 minutes") print("") + print("time < 1 second") + print("") print("
", summary["red"], "
", summary["yellow"], "
", summary["green"], "
", summary["white"], "
") -else: - # output results in CSV format + +# output results in CSV format +def output_csv(entries, sorted_list, args): print("time,size,file") - for key in sl: - result = entries[key] - print(result[0], result[1], key, sep=",") + for name in sorted_list: + entry = entries[name] + build_time = entry[1] - entry[0] + file_size = entry[2] + print(build_time, file_size, name, sep=",") + + +if output_fmt == "xml": + output_xml(entries, sorted_list, args) +elif output_fmt == "html": + output_html(entries, sorted_list, args) +else: + output_csv(entries, sorted_list, args) diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index e3d1ea88ece..96b7fd48dc9 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -57,26 +58,29 @@ std::unique_ptr add_keys( // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); - // sort and remove any duplicates from the combined keys - // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, - std::vector{0}, // only one key column - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + + // Drop duplicates from the combined keys, then sort the result. + // sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] + auto table_keys = + cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, + std::vector{0}, // only one key column + null_equality::EQUAL, + stream, + mr); + std::vector column_order{order::ASCENDING}; + std::vector null_precedence{null_order::AFTER}; // should be no nulls here + auto sorted_keys = + cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + + std::unique_ptr keys_column(std::move(sorted_keys.front())); // create a map for the indices // lower_bound([a,b,c,d,e,f],[a,b,c,d,f]) = [0,1,2,3,5] - auto map_indices = cudf::detail::lower_bound( - table_view{{keys_column->view()}}, - table_view{{old_keys}}, - std::vector{order::ASCENDING}, - std::vector{null_order::AFTER}, // should be no nulls here - stream, - mr); + auto map_indices = cudf::detail::lower_bound(table_view{{keys_column->view()}}, + table_view{{old_keys}}, + column_order, + null_precedence, + stream, + mr); // now create the indices column -- map old values to the new ones // gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fd86d8ec7d4..301338fa1a8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -216,15 +217,15 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::drop_duplicates(table_view{{all_keys->view()}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + auto table_keys = cudf::detail::unordered_drop_duplicates( + table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(table_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // next, concatenate the indices std::vector indices_views(columns.size()); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 72f6e034479..c1fb1fa2180 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -120,16 +121,17 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use drop_duplicates to make sure they are sorted and unique - auto table_keys = cudf::detail::drop_duplicates(table_view{{new_keys}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + // copy the keys -- use unordered_drop_duplicates to make sure they are unique, then + // sort the results. + auto unique_keys = cudf::detail::unordered_drop_duplicates( + table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(unique_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e8b4a8b1cbf..57bb222aaa0 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
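The three dictionary call sites in this patch (add_keys, concatenate, set_keys) now share the same two-step recipe in place of the sort-based drop_duplicates; distilled from the set_keys hunk just above:

    // 1. de-duplicate with the hash-based API (output order unspecified)
    auto unique_keys = cudf::detail::unordered_drop_duplicates(
      table_view{{new_keys}}, std::vector<size_type>{0}, null_equality::EQUAL, stream, mr);
    // 2. restore the sorted order that dictionary keys require
    auto sorted_keys = cudf::detail::sort(unique_keys->view(),
                                          std::vector<order>{order::ASCENDING},
                                          std::vector<null_order>{null_order::BEFORE},
                                          stream,
                                          mr);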
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e8b4a8b1cbf..57bb222aaa0 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include -#include #include #include @@ -219,20 +219,18 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto grouped_keys = helper().sorted_keys(rmm::cuda_stream_default, mr); + auto const stream = rmm::cuda_stream_default; + auto grouped_keys = helper().sorted_keys(stream, mr); - auto const& group_offsets = helper().group_offsets(rmm::cuda_stream_default); - std::vector<size_type> group_offsets_vector(group_offsets.size()); - thrust::copy(thrust::device_pointer_cast(group_offsets.begin()), - thrust::device_pointer_cast(group_offsets.end()), - group_offsets_vector.begin()); + auto const& group_offsets = helper().group_offsets(stream); + auto const group_offsets_vector = cudf::detail::make_std_vector_sync(group_offsets, stream); - if (values.num_columns()) { + if (not values.is_empty()) { auto grouped_values = cudf::detail::gather(values, - helper().key_sort_order(rmm::cuda_stream_default), + helper().key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, - rmm::cuda_stream_default, + stream, mr); return groupby::groups{ std::move(grouped_keys), std::move(group_offsets_vector), std::move(grouped_values)}; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index c2115c3caa4..21bfd8120f7 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -68,7 +68,7 @@ class make_pair_function { { // Compute the hash value of row `i` auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); - return cuco::make_pair(std::move(row_hash_value), std::move(i)); + return cuco::make_pair(row_hash_value, i); } private: diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index f38e653c4a6..e492968b8a6 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -45,7 +45,7 @@ struct make_pair_function_semi { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast<size_type>(i), 0); } }; diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 8563a2a3bd3..39fe0b60c8c 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -49,7 +49,7 @@ struct make_pair_function { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast<size_type>(i), 0); } }; diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 6f9149a47e2..234eaf51f96 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -93,7 +93,7 @@ struct reduce_dispatch_functor { case aggregation::NUNIQUE: { auto nunique_agg = dynamic_cast<nunique_aggregation*>(agg.get()); return make_fixed_width_scalar( - detail::distinct_count( + detail::unordered_distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr);
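This one-line call-site change is what keeps the NUNIQUE reduction's meaning intact: distinct_count now counts consecutive equivalent groups in input order (see the reworked implementation below), while the global number-of-unique-values that NUNIQUE needs comes from the hash-based unordered_distinct_count.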
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 5c695f8a16f..2c7488084b5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,18 @@ * limitations under the License. */ +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" + #include #include #include +#include +#include #include #include #include #include -#include #include #include @@ -30,39 +34,19 @@ #include #include +#include + +#include +#include +#include +#include #include namespace cudf { namespace detail { - -cudf::size_type distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) -{ - // sort only indices - auto sorted_indices = sorted_order(keys, - std::vector<order>{}, - std::vector<null_order>{}, - stream, - rmm::mr::get_current_device_resource()); - - // count unique elements - auto sorted_row_index = sorted_indices->view().data<cudf::size_type>(); - auto device_input_table = cudf::table_device_view::create(keys, stream); - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator<cudf::size_type>(0), - thrust::counting_iterator<cudf::size_type>(keys.num_rows()), - [sorted_row_index, comp] __device__(cudf::size_type i) { - return (i == 0 || not comp(sorted_row_index[i], sorted_row_index[i - 1])); - }); -} - +namespace { /** - * @brief Functor to check for `NAN` at an index in a `column_device_view`. + * @brief Functor to check for `NaN` at an index in a `column_device_view`. * * @tparam T The type of `column_device_view` */ template <typename T> struct check_for_nan { check_for_nan(cudf::column_device_view input) : _input{input} {} /** - * @brief Operator to be called to check for `NAN` at `index` in `_input` + * @brief Operator to be called to check for `NaN` at `index` in `_input` * - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN` and not null, else false + * @returns bool true if value at `index` is `NaN` and not null, else false */ - __device__ bool operator()(size_type index) + __device__ bool operator()(size_type index) const noexcept { return std::isnan(_input.data<T>()[index]) and _input.is_valid(index); } - protected: cudf::column_device_view _input; }; /** * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has `NAN`. + * `column_view` has `NaN`. */ struct has_nans { /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for floating point type columns. * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool true if `input` has `NAN` else false + * @returns bool true if `input` has `NaN` else false */ - template <typename T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr> + template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator<cudf::size_type>(0), - thrust::counting_iterator<cudf::size_type>(input.size()), - check_for_nan(device_view)); - return count > 0; + return thrust::any_of(rmm::exec_policy(stream), + thrust::counting_iterator<cudf::size_type>(0), + thrust::counting_iterator<cudf::size_type>(input.size()), + check_for_nan(device_view)); } /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for non-floating point type columns. And - * non-floating point columns can never have `NAN`, so it will always return + * non-floating point columns can never have `NaN`, so it will always return * false * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool Always false as non-floating point columns can't have `NAN` + * @returns bool Always false as non-floating point columns can't have `NaN` */ - template <typename T, std::enable_if_t<not std::is_floating_point<T>::value>* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) + template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) + { + return false; + } +}; + +/** + * @brief A functor to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NaN`. + */ +struct check_nan { + // Check if it's `NaN` for floating point type columns + template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr> + __device__ inline bool operator()(column_device_view const& input, size_type index) + { + return std::isnan(input.data<T>()[index]); + } + // Non-floating point type columns can never have `NaN`, so it will always return false. + template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr> + __device__ inline bool operator()(column_device_view const&, size_type) { return false; } }; +} // namespace + +cudf::size_type distinct_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) { + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator<cudf::size_type>(0), + thrust::counting_iterator<cudf::size_type>(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); +}
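The behavioral split between this count and the hash-based one that follows, on a concrete (hypothetical) single-column input:

    // keys: [1, 1, 2, 2, 1]
    // distinct_count            -> 3   (consecutive runs: [1 1], [2 2], [1])
    // unordered_distinct_count  -> 2   (distinct values: {1, 2})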
cudf::size_type unordered_distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const num_rows = table_ptr->num_rows(); + auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator<char>{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + + // when nulls are equal, insert non-null rows only to improve efficiency + if (nulls_equal == null_equality::EQUAL and has_null) { + thrust::counting_iterator<size_type> stencil(0); + auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + row_validity pred{static_cast<bitmask_type const*>(row_bitmask.data())}; + + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast<size_type>((null_count > 0) ? 1 : 0); + } + // otherwise, insert all + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); +}
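The effect of nulls_equal in the hash-based count above, again sketched on a single key column:

    // keys: [1, null, 1, null]
    // null_equality::EQUAL   -> 2   (non-null rows dedupe to {1}, +1 for the null group)
    // null_equality::UNEQUAL -> 3   (null rows never compare equal, so both survive)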
+ auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and + cudf::type_dispatcher(input.type(), has_nans{}, input, stream); + if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } + return count; +} } // namespace detail cudf::size_type distinct_count(column_view const& input, @@ -183,4 +269,18 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equa return detail::distinct_count(input, nulls_equal); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, null_handling, nan_handling); +} + +cudf::size_type unordered_distinct_count(table_view const& input, null_equality nulls_equal) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, nulls_equal); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index abc34663aee..2fd1f530b6d 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,16 @@ * limitations under the License. */ -#include +#include "drop_duplicates.cuh" +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" #include #include #include #include #include +#include #include #include #include @@ -37,6 +40,7 @@ #include #include +#include #include namespace cudf { @@ -85,12 +89,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, auto comp = row_equality_comparator( nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); + auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin(), + sorted_indices->view().end(), + unique_indices.begin(), + comp, + keep, + stream); return cudf::detail::slice(column_view(unique_indices), 0, @@ -106,7 +110,7 @@ std::unique_ptr drop_duplicates(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } @@ -130,6 +134,62 @@ std::unique_ptr
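The two counting paths above differ only in how row identity is established: distinct_count assumes pre-sorted input and counts the boundaries between adjacent runs, while unordered_distinct_count hashes whole rows and is order-insensitive. A minimal host-side sketch of both strategies (toy std:: code for illustration, not part of this patch):

#include <iostream>
#include <unordered_set>
#include <vector>

int main()
{
  // Sorted path: a value starts a new group when it differs from its predecessor,
  // mirroring the (i == 0 or not comp(i, i - 1)) predicate above.
  std::vector<int> sorted_in{0, 0, 1, 4, 4, 4, 8};
  int sorted_count = 0;
  for (std::size_t i = 0; i < sorted_in.size(); ++i) {
    if (i == 0 || sorted_in[i] != sorted_in[i - 1]) { ++sorted_count; }
  }

  // Hash path: one set probe per element, no ordering requirement.
  std::vector<int> unsorted_in{4, 0, 8, 4, 1, 0, 4};
  std::unordered_set<int> seen(unsorted_in.begin(), unsorted_in.end());

  std::cout << sorted_count << ' ' << seen.size() << '\n';  // both print 4
}

The sorted variant does no allocation at all, which is why the patch keeps both: callers with already-sorted keys get the cheap path, everyone else gets the hash path.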
diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu
index abc34663aee..2fd1f530b6d 100644
--- a/cpp/src/stream_compaction/drop_duplicates.cu
+++ b/cpp/src/stream_compaction/drop_duplicates.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,13 +14,16 @@
  * limitations under the License.
  */
 
-#include
+#include "drop_duplicates.cuh"
+#include "stream_compaction_common.cuh"
+#include "stream_compaction_common.hpp"
 
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -37,6 +40,7 @@
 #include
 #include
+#include
 #include
 
 namespace cudf {
@@ -85,12 +89,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys,
   auto comp = row_equality_comparator(
     nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal);
 
-  auto result_end = unique_copy(sorted_indices->view().begin<cudf::size_type>(),
-                                sorted_indices->view().end<cudf::size_type>(),
-                                unique_indices.begin<cudf::size_type>(),
-                                comp,
-                                keep,
-                                stream);
+  auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin<cudf::size_type>(),
+                                              sorted_indices->view().end<cudf::size_type>(),
+                                              unique_indices.begin<cudf::size_type>(),
+                                              comp,
+                                              keep,
+                                              stream);
 
   return cudf::detail::slice(column_view(unique_indices),
                              0,
@@ -106,7 +110,7 @@ std::unique_ptr<table> drop_duplicates(table_view const& input,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
-  if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) {
+  if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
     return empty_like(input);
   }
@@ -130,6 +134,62 @@ std::unique_ptr<table> drop_duplicates(table_view const& input,
                                        mr);
 }
 
+std::unique_ptr<table> unordered_drop_duplicates(table_view const& input,
+                                                 std::vector<size_type> const& keys,
+                                                 null_equality nulls_equal,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
+    return empty_like(input);
+  }
+
+  auto keys_view = input.select(keys);
+  auto table_ptr = cudf::table_device_view::create(keys_view, stream);
+  auto has_null  = nullate::DYNAMIC{cudf::has_nulls(keys_view)};
+  auto const num_rows{table_ptr->num_rows()};
+
+  hash_map_type key_map{compute_hash_table_size(num_rows),
+                        COMPACTION_EMPTY_KEY_SENTINEL,
+                        COMPACTION_EMPTY_VALUE_SENTINEL,
+                        detail::hash_table_allocator_type{default_allocator<char>{}, stream},
+                        stream.value()};
+
+  compaction_hash hash_key{has_null, *table_ptr};
+  row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal);
+
+  auto iter = cudf::detail::make_counting_transform_iterator(
+    0, [] __device__(size_type i) { return cuco::make_pair(i, i); });
+  // insert unique indices into the map.
+  key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value());
+
+  auto counting_iter = thrust::make_counting_iterator<size_type>(0);
+  rmm::device_uvector<bool> index_exists_in_map(num_rows, stream, mr);
+  // enumerate all indices to check if they are present in the map.
+  key_map.contains(counting_iter, counting_iter + num_rows, index_exists_in_map.begin(), hash_key);
+
+  auto const output_size{key_map.get_size()};
+
+  // write unique indices to a numeric column
+  auto unique_indices = cudf::make_numeric_column(
+    data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr);
+  auto mutable_view = mutable_column_device_view::create(*unique_indices, stream);
+  thrust::copy_if(rmm::exec_policy(stream),
+                  counting_iter,
+                  counting_iter + num_rows,
+                  index_exists_in_map.begin(),
+                  mutable_view->begin<size_type>(),
+                  thrust::identity{});
+
+  // run gather operation to establish new order
+  return detail::gather(input,
+                        unique_indices->view(),
+                        out_of_bounds_policy::DONT_CHECK,
+                        detail::negative_index_policy::NOT_ALLOWED,
+                        stream,
+                        mr);
+}
+
 }  // namespace detail
 
 std::unique_ptr<table> drop_duplicates(table_view const& input,
@@ -144,4 +204,13 @@ std::unique_ptr<table> drop_duplicates(table_view const& input,
     input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr);
 }
 
+std::unique_ptr<table> unordered_drop_duplicates(table_view const& input,
+                                                 std::vector<size_type> const& keys,
+                                                 null_equality nulls_equal,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::unordered_drop_duplicates(input, keys, nulls_equal, rmm::cuda_stream_default, mr);
+}
+
 }  // namespace cudf
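unordered_drop_duplicates keeps one representative index per distinct row by inserting row indices into a hash map whose hash and equality functions look through to the row contents (compaction_hash plus row_equality_comparator above), then gathering the surviving indices. A host-side analogue of that pattern, sketched with std::unordered_set over a toy row-major table (illustrative only, not cudf code):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_set>
#include <vector>

int main()
{
  // Toy "table": each row is a vector of ints. We deduplicate row *indices*,
  // hashing and comparing row contents, the same shape as the device-side key_map.
  std::vector<std::vector<int>> table{{1, 2}, {3, 4}, {1, 2}, {5, 6}};

  auto row_hash = [&](int i) {
    std::size_t h = 0;
    for (int v : table[i]) { h = h * 31 + std::hash<int>{}(v); }
    return h;
  };
  auto row_equal = [&](int lhs, int rhs) { return table[lhs] == table[rhs]; };

  std::unordered_set<int, decltype(row_hash), decltype(row_equal)> key_set(
    table.size(), row_hash, row_equal);
  for (int i = 0; i < static_cast<int>(table.size()); ++i) { key_set.insert(i); }

  // "Gather" one representative index per distinct row, like the copy_if +
  // gather steps above. Row 2 loses to row 0 because they compare equal.
  std::vector<int> unique_indices(key_set.begin(), key_set.end());
  std::sort(unique_indices.begin(), unique_indices.end());

  std::cout << unique_indices.size() << '\n';  // prints 3
}

As with the device version, the output order is whatever the hash structure yields, which is why the new tests sort results before comparing against expectations.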
diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh
new file mode 100644
index 00000000000..8ba9223a1bc
--- /dev/null
+++ b/cpp/src/stream_compaction/stream_compaction_common.cuh
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "stream_compaction_common.hpp"
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Device callable to hash a given row.
+ */
+template <typename Nullate>
+class compaction_hash {
+ public:
+  compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {}
+
+  __device__ inline auto operator()(size_type i) const noexcept
+  {
+    auto hash = _hash(i);
+    return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash;
+  }
+
+ private:
+  row_hash _hash;
+};
+
+/**
+ * @brief Device functor to determine if a row is valid.
+ */
+class row_validity {
+ public:
+  row_validity(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {}
+
+  __device__ inline bool operator()(const size_type& i) const noexcept
+  {
+    return cudf::bit_is_set(_row_bitmask, i);
+  }
+
+ private:
+  bitmask_type const* _row_bitmask;
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp
new file mode 100644
index 00000000000..1d743eccdbe
--- /dev/null
+++ b/cpp/src/stream_compaction/stream_compaction_common.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+namespace cudf {
+namespace detail {
+
+constexpr auto COMPACTION_EMPTY_KEY_SENTINEL   = std::numeric_limits<size_type>::max();
+constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits<size_type>::min();
+
+using hash_type = cuco::detail::MurmurHash3_32<size_type>;
+
+using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
+
+using hash_map_type =
+  cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;
+
+using row_hash = cudf::row_hasher<default_hash, cudf::nullate::DYNAMIC>;
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu
index dadeaf7d1e0..405c83ab872 100644
--- a/cpp/src/transform/encode.cu
+++ b/cpp/src/transform/encode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,7 +30,10 @@
 #include
 #include
+#include
 #include
+#include
+#include
 
 namespace cudf {
 namespace detail {
@@ -38,29 +41,23 @@ namespace detail {
 std::pair<std::unique_ptr<table>, std::unique_ptr<column>> encode(
   table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
 {
-  std::vector<size_type> drop_keys(input_table.num_columns());
+  auto const num_cols = input_table.num_columns();
+
+  std::vector<size_type> drop_keys(num_cols);
   std::iota(drop_keys.begin(), drop_keys.end(), 0);
 
-  // side effects of this function we are now dependent on:
-  // - resulting column elements are sorted ascending
-  // - nulls are sorted to the beginning
-  auto keys_table = cudf::detail::drop_duplicates(input_table,
-                                                  drop_keys,
-                                                  duplicate_keep_option::KEEP_FIRST,
-                                                  null_equality::EQUAL,
-                                                  null_order::AFTER,
-                                                  stream,
-                                                  mr);
+  auto unique_keys = cudf::detail::unordered_drop_duplicates(
+    input_table, drop_keys, null_equality::EQUAL, stream, mr);
+
+  std::vector<order> column_order(num_cols, order::ASCENDING);
+  std::vector<null_order> null_precedence(num_cols, null_order::AFTER);
+  auto sorted_unique_keys =
+    cudf::detail::sort(unique_keys->view(), column_order, null_precedence, stream, mr);
 
-  auto indices_column =
-    cudf::detail::lower_bound(keys_table->view(),
-                              input_table,
-                              std::vector<order>(input_table.num_columns(), order::ASCENDING),
-                              std::vector<null_order>(input_table.num_columns(), null_order::AFTER),
-                              stream,
-                              mr);
+  auto indices_column = cudf::detail::lower_bound(
+    sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr);
 
-  return std::make_pair(std::move(keys_table), std::move(indices_column));
+  return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column));
 }
 
 }  // namespace detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e23403e68e4..6b5670630ec 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License.
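An aside on the encode() change above: the rewrite builds the dictionary with an unordered dedup, sorts the unique keys once, then maps every input row to its key's position with lower_bound. The same three steps on host data (a toy single-column sketch in plain std::, not the cudf API):

#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
  std::vector<int> input{30, 10, 30, 20, 10};

  // 1. Collect the distinct keys; ordering is imposed explicitly afterwards,
  //    no longer as a side effect of the dedup step.
  std::vector<int> keys(input);
  std::sort(keys.begin(), keys.end());
  keys.erase(std::unique(keys.begin(), keys.end()), keys.end());

  // 2. Map each input value to the index of its key via binary search.
  std::vector<int> indices;
  for (int v : input) {
    indices.push_back(std::lower_bound(keys.begin(), keys.end(), v) - keys.begin());
  }

  for (int idx : indices) { std::cout << idx << ' '; }  // prints: 2 0 2 1 0
}

The design point of the patch is step 1: the old code depended on drop_duplicates happening to return sorted keys with nulls first, whereas the new code sorts explicitly, so the dedup is free to use the faster hash path.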
You may obtain a copy of the License at @@ -292,6 +292,7 @@ ConfigureTest( ConfigureTest( STREAM_COMPACTION_TEST stream_compaction/apply_boolean_mask_tests.cpp + stream_compaction/distinct_count_tests.cpp stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp stream_compaction/drop_duplicates_tests.cpp diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp new file mode 100644 index 00000000000..78b52db5255 --- /dev/null +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; + +constexpr int32_t XXX{70}; // Mark for null elements +constexpr int32_t YYY{3}; // Mark for null elements + +template +struct DistinctCountCommon : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); + +TYPED_TEST(DistinctCountCommon, NoNull) +{ + using T = TypeParam; + + auto const input = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + // explicit instantiation to one particular type (`double`) to reduce build time + std::vector input_data(input.begin(), input.end()); + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const gold_ordered = std::distance(input_data.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TYPED_TEST(DistinctCountCommon, TableNoNull) +{ + using T = TypeParam; + + auto const input1 = cudf::test::make_type_param_vector( + {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + auto const input2 = cudf::test::make_type_param_vector( + {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + + std::vector> pair_input; + std::transform( + input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { + return std::make_pair(a, b); + }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); + EXPECT_EQ(expected, 
cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const gold_ordered = std::distance(pair_input.begin(), new_end); + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); +} + +struct DistinctCount : public cudf::test::BaseFixture { +}; + +TEST_F(DistinctCount, WithNull) +{ + using T = int32_t; + + std::vector input = {1, 3, 3, XXX, 31, 1, 8, 2, 0, XXX, XXX, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end) - 3; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, IgnoringNull) +{ + using T = int32_t; + + std::vector input = {1, YYY, YYY, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + // Removing 2 from expected to remove count for `XXX` and `YYY` + EXPECT_EQ( + expected - 2, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows + auto const gold_ordered = std::distance(input.begin(), new_end) - 1; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansAndNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, NAN, 0, NAN, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected + 1, // +1 since `NAN` is not in std::set + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 2; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansOnly) +{ + using T = float; + + 
std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, NansAsNullWithNoNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 4; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_null = 1; + EXPECT_EQ( + expected_all_null, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_null, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 3; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 0; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, 
nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, EmptyColumn) +{ + using T = float; + + cudf::test::fixed_width_column_wrapper input_col{}; + + constexpr auto expected = 0; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, StringColumnWithNull) +{ + cudf::test::strings_column_wrapper input_col{ + {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; + + cudf::size_type const expected = + (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, TableWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::table_view input{{col1, col2}}; + + EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, EmptyColumnedTable) +{ + std::vector cols{}; + + cudf::table_view input(cols); + + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableMixedTypes) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; + cudf::table_view input{{col1, col2, col3}}; + + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableWithStringColumnWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::strings_column_wrapper col2{ + {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + + cudf::table_view input{{col1, col2}}; + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 916d2a33b97..d49b8208094 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,254 +14,98 @@ * limitations under the License. 
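The distinct-count tests above pin down how unordered_distinct_count folds nulls and NaNs together: start from the table-level count, where a null row and a NaN row each count once, then decrement per the policies. A hypothetical host-side restatement of that bookkeeping (the helper name and signature are invented for illustration):

#include <cassert>

// raw_count: distinct count with nulls treated as equal (null and NaN each count once).
int adjusted_count(int raw_count, bool has_null, bool has_nan,
                   bool nan_is_null, bool exclude_nulls)
{
  int count = raw_count;
  // EXCLUDE drops the single null entry from the count.
  if (exclude_nulls && has_null) { --count; }
  // When NaN is treated as null it must not be counted separately: it collapses
  // into the existing null entry, or is dropped outright under EXCLUDE.
  if (nan_is_null && has_nan && (has_null || exclude_nulls)) { --count; }
  return count;
}

int main()
{
  // Mirrors NansAsNullWithNull: {1, 3, NAN, null, 31} -> 4 under INCLUDE.
  assert(adjusted_count(5, true, true, true, false) == 4);
  // Mirrors NansAsNullWithIgnoreNull: same input -> 3 under EXCLUDE.
  assert(adjusted_count(5, true, true, true, true) == 3);
  return 0;
}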
*/ -#include -#include -#include #include +#include #include #include #include #include + #include #include #include #include #include +#include +#include + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -template -struct DistinctCountCommon : public cudf::test::BaseFixture { -}; - -TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); - -TYPED_TEST(DistinctCountCommon, NoNull) -{ - using T = TypeParam; - - auto const input = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TYPED_TEST(DistinctCountCommon, TableNoNull) -{ - using T = TypeParam; - - auto const input1 = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - auto const input2 = cudf::test::make_type_param_vector( - {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); - std::vector> pair_input; - std::transform( - input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); - }); - - cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); - cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); - - std::vector cols{input_col1, input_col2}; - cudf::table_view input_table(cols); - - cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); - EXPECT_EQ(expected, cudf::distinct_count(input_table, null_equality::EQUAL)); -} - -struct DistinctCount : public cudf::test::BaseFixture { +struct DropDuplicatesCommon : public cudf::test::BaseFixture { }; -TEST_F(DistinctCount, WithNull) -{ - using T = int32_t; - - // Considering 70 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, IgnoringNull) -{ - using T = int32_t; - - // Considering 70 and 3 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - // Removing 2 from expected to remove count for 70 and 3 - EXPECT_EQ(expected - 2, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansAndNull) +TEST_F(DropDuplicatesCommon, StringKeyColumn) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, - 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = std::set(input.begin(), 
input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansOnly) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, NansAsNullWithNoNull) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; -TEST_F(DistinctCount, NansAsNullWithNull) -{ - using T = float; + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got_unordered = unordered_drop_duplicates(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); - cudf::size_type expected = 4; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); } -TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +TEST_F(DropDuplicatesCommon, EmptyInputTable) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - cudf::size_type expected = 3; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumn) +TEST_F(DropDuplicatesCommon, NoColumnInputTable) { - using T = float; - - cudf::test::fixed_width_column_wrapper input_col{}; - - cudf::size_type expected = 0; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; -TEST_F(DistinctCount, 
StringColumnWithNull) -{ - cudf::test::strings_column_wrapper input_col{ - {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; - - cudf::size_type expected = - (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); -TEST_F(DistinctCount, TableWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::table_view input{{col1, col2}}; - - EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumnedTable) +TEST_F(DropDuplicatesCommon, EmptyKeys) { - std::vector cols{}; - - cudf::table_view input(cols); - - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; -TEST_F(DistinctCount, TableMixedTypes) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; - cudf::table_view input{{col1, col2, col3}}; - - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); -TEST_F(DistinctCount, TableWithStringColumnWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::strings_column_wrapper col2{ - {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - - cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } -struct DropDuplicate : public cudf::test::BaseFixture { +struct DropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicate, NonNullTable) +TEST_F(DropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; 
cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -271,7 +115,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate + // Keep the first duplicate row // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; @@ -284,7 +128,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // keep last of duplicate + // Keep the last duplicate row cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; @@ -296,7 +140,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; @@ -309,88 +153,145 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, WithNull) +TEST_F(DropDuplicates, KeepFirstWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; + auto got_first_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); - // Keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{3, 2, 5, 5, 8}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; + auto got_first_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, 
null_equality::UNEQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique of duplicate - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); } -TEST_F(DropDuplicate, StringKeyColumn) +TEST_F(DropDuplicates, KeepLastWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, key_col}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; + auto got_last_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{3, 2, 1, 4, 8}, + {1, 1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; + auto got_last_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); } -TEST_F(DropDuplicate, EmptyInputTable) +TEST_F(DropDuplicates, KeepNoneWithNull) { - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - cudf::table_view input{{col}}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{8}, {1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{21}, {1}}; + cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; + auto got_unique_equal = + drop_duplicates(input, keys, 
cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 21}, + {0, 0, 1}}; + cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; + auto got_unique_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } -TEST_F(DropDuplicate, NoColumnInputTable) +struct UnorderedDropDuplicates : public cudf::test::BaseFixture { +}; + +TEST_F(UnorderedDropDuplicates, NonNullTable) { - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DropDuplicate, EmptyKeys) +TEST_F(UnorderedDropDuplicates, WithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 
1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 2a4dd5ff9c8..d705504cc0c 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -22,7 +22,6 @@ Properties :toctree: api/ Index.empty - Index.gpu_values Index.has_duplicates Index.is_monotonic Index.is_monotonic_increasing @@ -93,9 +92,9 @@ Conversion :toctree: api/ Index.astype - Index.to_array Index.to_arrow Index.to_list + Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 891bb3a1e61..cf5dd4a2a1d 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -408,13 +408,13 @@ Serialization / IO / conversion .. autosummary:: :toctree: api/ - Series.to_array Series.to_arrow + Series.to_cupy Series.to_dlpack Series.to_frame - Series.to_gpu_array Series.to_hdf Series.to_json + Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index a7e959a05a7..0034584a6f7 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -4550,7 +4550,7 @@ } ], "source": [ - "df['a'].to_array()" + "df['a'].to_numpy()" ] }, { @@ -4571,7 +4571,7 @@ } ], "source": [ - "ddf['a'].compute().to_array()" + "ddf['a'].compute().to_numpy()" ] }, { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 61981b34615..cb3234bf706 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -45,7 +45,6 @@ public final class ColumnVector extends ColumnView { NativeDepsLoader.loadNativeDeps(); } - private final OffHeapState offHeap; private Optional nullCount = Optional.empty(); private int refCount; @@ -56,14 +55,23 @@ public final class ColumnVector extends ColumnView { * owned by this instance. */ public ColumnVector(long nativePointer) { - super(getColumnViewFromColumn(nativePointer)); + super(new OffHeapState(nativePointer)); assert nativePointer != 0; - offHeap = new OffHeapState(nativePointer); MemoryCleaner.register(this, offHeap); this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer) { + long viewHandle = initViewHandle( + type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, offsetBuffer, null); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, + offsetBuffer, null, viewHandle); + } + /** * Create a new column vector based off of data already on the device. 
* @param type the type of the vector @@ -81,24 +89,29 @@ public ColumnVector(long nativePointer) { public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer) { - super(ColumnVector.initViewHandle( - type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, offsetBuffer, null)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer)); assert !type.equals(DType.LIST) : "This constructor should not be used for list type"; if (!type.equals(DType.STRING)) { assert offsetBuffer == null : "offsets are only supported for STRING"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, - offsetBuffer, null, viewHandle); MemoryCleaner.register(this, offHeap); this.nullCount = nullCount; - this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { + long viewHandle = initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, + offsetBuffer, childHandles); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, + toClose, viewHandle); + } + /** * Create a new column vector based off of data already on the device with child columns. * @param type the type of the vector, typically a nested type @@ -118,16 +131,12 @@ public ColumnVector(DType type, long rows, Optional nullCount, public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { - super(initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, - offsetBuffer, childHandles)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer, toClose, childHandles)); if (!type.equals(DType.STRING) && !type.equals(DType.LIST)) { assert offsetBuffer == null : "offsets are only supported for STRING, LISTS"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, - toClose, viewHandle); MemoryCleaner.register(this, offHeap); this.refCount = 0; @@ -143,8 +152,7 @@ public ColumnVector(DType type, long rows, Optional nullCount, * @param contiguousBuffer the buffer that this is based off of. */ private ColumnVector(long viewAddress, DeviceMemoryBuffer contiguousBuffer) { - super(viewAddress); - offHeap = new OffHeapState(viewAddress, contiguousBuffer); + super(new OffHeapState(viewAddress, contiguousBuffer)); MemoryCleaner.register(this, offHeap); // TODO we may want to ask for the null count anyways... 
this.nullCount = Optional.empty(); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 422311fc8e0..8155fe79080 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -40,6 +40,7 @@ public class ColumnView implements AutoCloseable, BinaryOperable { protected final DType type; protected final long rows; protected final long nullCount; + protected final ColumnVector.OffHeapState offHeap; /** * Constructs a Column View given a native view address @@ -50,6 +51,22 @@ public class ColumnView implements AutoCloseable, BinaryOperable { this.type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); this.rows = ColumnView.getNativeRowCount(viewHandle); this.nullCount = ColumnView.getNativeNullCount(viewHandle); + this.offHeap = null; + } + + + /** + * Intended to be called from ColumnVector when it is being constructed. Because state creates a + * cudf::column_view instance and will close it in all cases, we don't want to have to double + * close it. + * @param state the state this view is based off of. + */ + protected ColumnView(ColumnVector.OffHeapState state) { + offHeap = state; + viewHandle = state.getViewHandle(); + type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); + rows = ColumnView.getNativeRowCount(viewHandle); + nullCount = ColumnView.getNativeNullCount(viewHandle); } /** @@ -265,7 +282,10 @@ public long getDeviceMemorySize() { @Override public void close() { - ColumnView.deleteColumnView(viewHandle); + // close the view handle so long as offHeap is not going to do it for us. + if (offHeap == null) { + ColumnView.deleteColumnView(viewHandle); + } viewHandle = 0; } @@ -3248,7 +3268,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. - * @param key ColumnView of search keys. + * @param keys ColumnView of search keys. * @param findOption Whether to find the first index of the key, or the last. * @return The resultant column of int32 indices */ @@ -3284,6 +3304,17 @@ public final Scalar getScalarElement(int index) { return new Scalar(getType(), getElement(getNativeView(), index)); } + /** + * Get the number of bytes needed to allocate a validity buffer for the given number of rows. + * According to cudf::bitmask_allocation_size_bytes, the padding boundary for null mask is 64 bytes. + */ + static long getValidityBufferSize(int numRows) { + // number of bytes required = Math.ceil(number of bits / 8) + long actualBytes = ((long) numRows + 7) >> 3; + // padding to the multiplies of the padding boundary(64 bytes) + return ((actualBytes + 63) >> 6) << 6; + } + ///////////////////////////////////////////////////////////////////////////// // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// @@ -3701,7 +3732,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat * Native method to find the first (or last) index of each search key in the specified column, * in each row of a list column. * @param nativeView the column view handle of the list - * @param scalarColumnHandle handle to the search key column + * @param keyColumnHandle handle to the search key column * @param isFindFirst Whether to find the first index of the key, or the last. 
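The getValidityBufferSize helper above replaces the native getNativeValidPointerSize call with pure bit arithmetic: one validity bit per row, rounded up to whole bytes, then padded to the 64-byte null-mask boundary that cudf::bitmask_allocation_size_bytes prescribes. A standalone restatement of the same formula (hypothetical names, for illustration):

#include <cstdint>
#include <iostream>

std::int64_t validity_buffer_size(std::int32_t num_rows)
{
  // ceil(num_rows / 8) bytes of validity bits...
  std::int64_t const actual_bytes = (static_cast<std::int64_t>(num_rows) + 7) >> 3;
  // ...rounded up to a multiple of the 64-byte padding boundary.
  return ((actual_bytes + 63) >> 6) << 6;
}

int main()
{
  std::cout << validity_buffer_size(1) << '\n';    // 64
  std::cout << validity_buffer_size(512) << '\n';  // 64 (512 rows fit in 64 bytes)
  std::cout << validity_buffer_size(513) << '\n';  // 128
}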
* @return column handle of the resultant column of int32 indices */ @@ -3881,11 +3912,6 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; - /** - * Get the number of bytes needed to allocate a validity buffer for the given number of rows. - */ - static native long getNativeValidPointerSize(int size); - //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the OffHeap inner class. @@ -3975,7 +4001,7 @@ static ColumnVector createColumnVector(DType type, int rows, HostMemoryBuffer da DeviceMemoryBuffer mainValidDevBuff = null; DeviceMemoryBuffer mainOffsetsDevBuff = null; if (mainColValid != null) { - long validLen = getNativeValidPointerSize(mainColRows); + long validLen = getValidityBufferSize(mainColRows); mainValidDevBuff = DeviceMemoryBuffer.allocate(validLen); mainValidDevBuff.copyFromHostBuffer(mainColValid, 0, validLen); } @@ -4084,7 +4110,7 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows data.copyFromHostBuffer(dataBuffer, 0, dataLen); } if (validityBuffer != null) { - long validLen = getNativeValidPointerSize((int)rows); + long validLen = getValidityBufferSize((int)rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(validityBuffer, 0, validLen); } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 742501be375..2e5b0202dc5 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -307,7 +307,7 @@ public static DType fromJavaBigDecimal(BigDecimal dec) { return new DType(DTypeEnum.DECIMAL128, -dec.scale()); } throw new IllegalArgumentException("Precision " + dec.precision() + - " exceeds max precision cuDF can support " + DECIMAL64_MAX_PRECISION); + " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION); } /** diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 0fe7d7a5df8..3abc6db385d 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -199,7 +199,7 @@ public ColumnVector copyToDevice() { } HostMemoryBuffer hvalid = this.offHeap.valid; if (hvalid != null) { - long validLen = ColumnView.getNativeValidPointerSize((int) rows); + long validLen = ColumnView.getValidityBufferSize((int) rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(hvalid, 0, validLen); } @@ -858,7 +858,7 @@ public static HostColumnVector timestampNanoSecondsFromBoxedLongs(Long... 
values * Build */ - public static final class ColumnBuilder implements AutoCloseable { + public static final class ColumnBuilder implements AutoCloseable { private DType type; private HostMemoryBuffer data; @@ -869,28 +869,78 @@ public static final class ColumnBuilder implements AutoCloseable { private boolean nullable; private long rows; private long estimatedRows; + private long rowCapacity = 0L; + private long validCapacity = 0L; private boolean built = false; private List childBuilders = new ArrayList<>(); + private Runnable nullHandler; - private int currentIndex = 0; - private int currentByteIndex = 0; - + // The value of currentIndex can't exceed Integer.MAX_VALUE. It is stored as a long so it can + // be passed directly to HostMemoryBuffer.setXXX, which requires a long offset. + private long currentIndex = 0; + // Only for strings: the current write position in the byte (data) buffer + private int currentStringByteIndex = 0; + // Use a bit shift instead of a multiply to convert a row offset to a byte offset + private int bitShiftBySize = 0; + private static final int bitShiftByOffset = (int)(Math.log(OFFSET_SIZE) / Math.log(2)); public ColumnBuilder(HostColumnVector.DataType type, long estimatedRows) { this.type = type.getType(); this.nullable = type.isNullable(); this.rows = 0; - this.estimatedRows = estimatedRows; + this.estimatedRows = Math.max(estimatedRows, 1L); + this.bitShiftBySize = (int)(Math.log(this.type.getSizeInBytes()) / Math.log(2)); + + // initialize the null handler according to the data type + this.setupNullHandler(); + for (int i = 0; i < type.getNumChildren(); i++) { childBuilders.add(new ColumnBuilder(type.getChild(i), estimatedRows)); } } + private void setupNullHandler() { + if (this.type == DType.LIST) { + this.nullHandler = () -> { + this.growListBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); + }; + } else if (this.type == DType.STRING) { + this.nullHandler = () -> { + this.growStringBuffersAndRows(0); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, currentStringByteIndex); + }; + } else if (this.type == DType.STRUCT) { + this.nullHandler = () -> { + this.growStructBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + for (ColumnBuilder childBuilder : childBuilders) { + childBuilder.appendNull(); + } + }; + } else { + this.nullHandler = () -> { + this.growFixedWidthBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + }; + } + } + public HostColumnVector build() { List hostColumnVectorCoreList = new ArrayList<>(); for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row capacity, because it grows lazily. + if (valid != null) { + growValidBuffer(); + } HostColumnVector hostColumnVector = new HostColumnVector(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); built = true; @@ -902,6 +952,10 @@ private HostColumnVectorCore buildNestedInternal() { for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row capacity, because it grows lazily.
+ if (valid != null) { + growValidBuffer(); + } return new HostColumnVectorCore(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); } @@ -929,71 +983,113 @@ public ColumnBuilder appendStructValues(StructData... inputList) { } /** - * A method that is responsible for growing the buffers as needed - * and incrementing the row counts when we append values or nulls. - * @param hasNull indicates whether the validity buffer needs to be considered, as the - * nullcount may not have been fully calculated yet - * @param length used for strings + * Grows the valid buffer lazily: it won't be materialized until the first null + * value is appended. This method reuses rowCapacity to track the size of the column, + * so please call the type-specific grow method to update rowCapacity before calling + * this method. + */ + private void growValidBuffer() { + if (valid == null) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + valid = HostMemoryBuffer.allocate(maskBytes); + valid.setMemory(0, valid.length, (byte) 0xFF); + validCapacity = rowCapacity; + return; + } + if (validCapacity < rowCapacity) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + HostMemoryBuffer newValid = HostMemoryBuffer.allocate(maskBytes); + newValid.setMemory(0, newValid.length, (byte) 0xFF); + valid = copyBuffer(newValid, valid); + validCapacity = rowCapacity; + } + } + + /** + * Automatically grows the data buffer for fixed-width columns as needed and increments + * the row count. Please call this method before appending any value or null. */ - private void growBuffersAndRows(boolean hasNull, int length) { + private void growFixedWidthBuffersAndRows() { assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; rows++; - long targetDataSize = 0; - if (!type.isNestedType()) { - if (type.equals(DType.STRING)) { - targetDataSize = data == null ? length : currentByteIndex + length; - } else { - targetDataSize = data == null ? estimatedRows * type.getSizeInBytes() : rows * type.getSizeInBytes(); - } + if (data == null) { + data = HostMemoryBuffer.allocate(estimatedRows << bitShiftBySize); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); + data = copyBuffer(HostMemoryBuffer.allocate(newCap << bitShiftBySize), data); + rowCapacity = newCap; } + } - if (targetDataSize > 0) { - if (data == null) { - data = HostMemoryBuffer.allocate(targetDataSize); - } else { - long maxLen; - if (type.equals(DType.STRING)) { - maxLen = Integer.MAX_VALUE; - } else { - maxLen = Integer.MAX_VALUE * (long) type.getSizeInBytes(); - } - long oldLen = data.getLength(); - long newDataLen = Math.max(1, oldLen); - while (targetDataSize > newDataLen) { - newDataLen = newDataLen * 2; - } - if (newDataLen != oldLen) { - newDataLen = Math.min(newDataLen, maxLen); - if (newDataLen < targetDataSize) { - throw new IllegalStateException("A data buffer for strings is not supported over 2GB in size"); - } - HostMemoryBuffer newData = HostMemoryBuffer.allocate(newDataLen); - data = copyBuffer(newData, data); - } - } + /** + * Automatically grows the offsets buffer for list columns as needed and increments + * the row count. Please call this method before appending any value or null.
+ */ + private void growListBuffersAndRows() { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; } - if (type.equals(DType.LIST) || type.equals(DType.STRING)) { - if (offsets == null) { - offsets = HostMemoryBuffer.allocate((estimatedRows + 1) * OFFSET_SIZE); - offsets.setInt(0, 0); - } else if ((rows +1) * OFFSET_SIZE > offsets.length) { - long newOffsetLen = offsets.length * 2; - HostMemoryBuffer newOffsets = HostMemoryBuffer.allocate(newOffsetLen); - offsets = copyBuffer(newOffsets, offsets); - } + } + + /** + * Automatically grows the offsets and data buffers for string columns as needed and + * increments the row count. Please call this method before appending any value or null. + * + * @param stringLength number of bytes required by the next row + */ + private void growStringBuffersAndRows(int stringLength) { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + // Initialize data buffer with at least 1 byte in case the first appended value is null. + data = HostMemoryBuffer.allocate(Math.max(1, stringLength)); + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + return; } - if (hasNull || nullCount > 0) { - if (valid == null) { - long targetValidSize = ColumnView.getNativeValidPointerSize((int)estimatedRows); - valid = HostMemoryBuffer.allocate(targetValidSize); - valid.setMemory(0, targetValidSize, (byte) 0xFF); - } else if (valid.length < ColumnView.getNativeValidPointerSize((int)rows)) { - long newValidLen = valid.length * 2; - HostMemoryBuffer newValid = HostMemoryBuffer.allocate(newValidLen); - newValid.setMemory(0, newValidLen, (byte) 0xFF); - valid = copyBuffer(newValid, valid); - } + + if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; + } + + long currentLength = currentStringByteIndex + stringLength; + if (currentLength > data.length) { + long requiredLength = data.length; + do { + requiredLength = requiredLength * 2; + } while (currentLength > requiredLength); + data = copyBuffer(HostMemoryBuffer.allocate(requiredLength), data); + } + } + + /** + * For struct columns, we only need to update rows and rowCapacity (needed to grow the + * valid buffer), because a struct column holds no buffers of its own. + * Please call this method before appending any value or null.
+ */ + private void growStructBuffersAndRows() { + assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (rowCapacity == 0) { + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + rowCapacity = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); } } @@ -1015,29 +1111,13 @@ private HostMemoryBuffer copyBuffer(HostMemoryBuffer targetBuffer, HostMemoryBuf * Method that sets the null bit in the validity vector * @param index the row index at which the null is marked */ - private void setNullAt(int index) { + private void setNullAt(long index) { assert index < rows : "Index for null value should fit the column with " + rows + " rows"; nullCount += BitVectorHelper.setNullAt(valid, index); } public final ColumnBuilder appendNull() { - growBuffersAndRows(true, 0); - setNullAt(currentIndex); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); - if (type.hasOffsets()) { - if (type.equals(DType.LIST)) { - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); - } else { - // It is a String - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); - } - } else if (type.equals(DType.STRUCT)) { - // structs propagate nulls to children and even further down if needed - for (ColumnBuilder childBuilder : childBuilders) { - childBuilder.appendNull(); - } - } + nullHandler.run(); return this; } @@ -1081,7 +1161,7 @@ public ColumnBuilder endStruct() { assert type.equals(DType.STRUCT) : "This only works for structs"; assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " + childBuilders + " should all have the same currentIndex " + type; - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growStructBuffersAndRows(); currentIndex++; return this; } @@ -1095,9 +1175,8 @@ assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " */ public ColumnBuilder endList() { assert type.equals(DType.LIST); - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); - currentIndex++; - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); + growListBuffersAndRows(); + offsets.setInt(++currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); return this; } @@ -1155,80 +1234,67 @@ public void incrCurrentIndex() { } public int getCurrentIndex() { - return currentIndex; + return (int) currentIndex; } + @Deprecated public int getCurrentByteIndex() { - return currentByteIndex; + return currentStringByteIndex; } public final ColumnBuilder append(byte value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByByte(); assert currentIndex < rows; - data.setByte(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setByte(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(short value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByShort(); assert currentIndex < rows; - data.setShort(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setShort(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(int value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + 
type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByInt(); assert currentIndex < rows; - data.setInt(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setInt(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(long value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByLong(); assert currentIndex < rows; - data.setLong(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setLong(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(float value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT32); assert currentIndex < rows; - data.setFloat(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setFloat(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(double value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT64); assert currentIndex < rows; - data.setDouble(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setDouble(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(boolean value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.BOOL8); assert currentIndex < rows; - data.setBoolean(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setBoolean(currentIndex++ << bitShiftBySize, value); return this; } @@ -1237,22 +1303,19 @@ public ColumnBuilder append(BigDecimal value) { } public ColumnBuilder append(BigInteger unscaledVal) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert currentIndex < rows; if (type.typeId == DType.DTypeEnum.DECIMAL32) { - data.setInt(currentIndex * type.getSizeInBytes(), unscaledVal.intValueExact()); + data.setInt(currentIndex++ << bitShiftBySize, unscaledVal.intValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { - data.setLong(currentIndex * type.getSizeInBytes(), unscaledVal.longValueExact()); + data.setLong(currentIndex++ << bitShiftBySize, unscaledVal.longValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { - assert currentIndex < rows; byte[] unscaledValueBytes = unscaledVal.toByteArray(); byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); - data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); - } else { + data.setBytes(currentIndex++ << bitShiftBySize, result, 0, result.length); + } else { throw new IllegalStateException(type + " is not a supported decimal type."); } - currentIndex++; - currentByteIndex += type.getSizeInBytes(); return this; } @@ -1271,14 +1334,13 @@ public ColumnBuilder appendUTF8String(byte[] value, int srcOffset, int length) { assert length >= 0; assert value.length + srcOffset <= length; assert type.equals(DType.STRING) : " type " + type + " is not 
String"; - currentIndex++; - growBuffersAndRows(false, length); - assert currentIndex < rows + 1; + growStringBuffersAndRows(length); + assert currentIndex < rows; if (length > 0) { - data.setBytes(currentByteIndex, value, srcOffset, length); + data.setBytes(currentStringByteIndex, value, srcOffset, length); } - currentByteIndex += length; - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); + currentStringByteIndex += length; + offsets.setInt(++currentIndex << bitShiftByOffset, currentStringByteIndex); return this; } @@ -1822,7 +1884,7 @@ public final Builder append(HostColumnVector columnVector) { } private void allocateBitmaskAndSetDefaultValues() { - long bitmaskSize = ColumnView.getNativeValidPointerSize((int) rows); + long bitmaskSize = ColumnView.getValidityBufferSize((int) rows); valid = HostMemoryBuffer.allocate(bitmaskSize); valid.setMemory(0, bitmaskSize, (byte) 0xFF); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0fce27bc130..63247eb0066 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1790,16 +1790,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(J CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidPointerSize(JNIEnv *env, - jobject j_object, - jint size) { - try { - cudf::jni::auto_set_device(env); - return static_cast(cudf::bitmask_allocation_size_bytes(size)); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "native handle is null", 0); diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index a26dbec4907..7b476c31b95 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -39,21 +40,34 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertEquals(byteColumnVector.getByte(0), 2); - assertEquals(byteColumnVector.getByte(1), 3); - assertEquals(byteColumnVector.getByte(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getByte(0), 2); + assertEquals(cv.getByte(1), 3); + assertEquals(cv.getByte(2), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { - assertFalse(v.hasNulls()); - assertEquals(0xff, Byte.toUnsignedInt(v.getByte(0)), 0xff); - assertEquals(128, Byte.toUnsignedInt(v.getByte(1)), 128); - assertEquals(5, Byte.toUnsignedInt(v.getByte(2)), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xff, Byte.toUnsignedInt(cv.getByte(0)), 0xff); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(1)), 128); + assertEquals(5, Byte.toUnsignedInt(cv.getByte(2)), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(false, + new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); } } @@ -70,47 +84,73 @@ public void testAppendRepeatingValues() { @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(3)); - assertFalse(byteColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getByte(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getByte(-1)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedBytes( - new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i 
= 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedBytes( + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(true, + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedUnsignedBytes( - new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertEquals(128, Byte.toUnsignedInt(byteColumnVector.getByte(4))); - assertEquals(254, Byte.toUnsignedInt(byteColumnVector.getByte(5))); - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(4))); + assertEquals(254, Byte.toUnsignedInt(cv.getByte(5))); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedUnsignedBytes( + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(false, + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java new file mode 100644 index 00000000000..263244b2413 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * ColumnBuilderHelper helps to test ColumnBuilder with existing ColumnVector tests.
+ */ +public class ColumnBuilderHelper { + + public static HostColumnVector build( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.build(); + } + } + + public static ColumnVector buildOnDevice( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.buildAndPutOnDevice(); + } + } + + public static HostColumnVector fromBoxedBytes(boolean signed, Byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Byte v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedDoubles(Double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.FLOAT64), + values.length, + (b) -> { + for (Double v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedInts(boolean signed, Integer... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Integer v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedLongs(boolean signed, Long... values) { + DType dt = signed ? DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Long v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBytes(boolean signed, byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (byte v : values) b.append(v); + }); + } + + public static HostColumnVector fromDecimals(BigDecimal... values) { + // Simply copied from HostColumnVector.fromDecimals + BigDecimal maxDec = Arrays.stream(values).filter(Objects::nonNull) + .max(Comparator.comparingInt(BigDecimal::precision)) + .orElse(BigDecimal.ZERO); + int maxScale = Arrays.stream(values).filter(Objects::nonNull) + .map(decimal -> decimal.scale()) + .max(Comparator.naturalOrder()) + .orElse(0); + maxDec = maxDec.setScale(maxScale, RoundingMode.UNNECESSARY); + + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.fromJavaBigDecimal(maxDec)), + values.length, + (b) -> { + for (BigDecimal v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromDoubles(double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, DType.FLOAT64), + values.length, + (b) -> { + for (double v : values) b.append(v); + }); + } + + public static HostColumnVector fromInts(boolean signed, int... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (int v : values) b.append(v); + }); + } + + public static HostColumnVector fromLongs(boolean signed, long... values) { + DType dt = signed ?
DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (long v : values) b.append(v); + }); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 2dbec454eb2..8f39c3c51ce 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3559,7 +3559,6 @@ void testCastDecimal64ToString() { for (int scale : new int[]{-5, -2, -1, 0, 1, 2, 5}) { for (int i = 0; i < strDecimalValues.length; i++) { strDecimalValues[i] = dumpDecimal(unScaledValues[i], scale); - System.out.println(strDecimalValues[i]); } testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL64, scale), diff --git a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java index c2772520f57..994066c5df0 100644 --- a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,10 +22,12 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.util.Arrays; import java.util.Objects; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; @@ -33,9 +35,11 @@ public class DecimalColumnVectorTest extends CudfTestBase { private static final Random rdSeed = new Random(1234); private static final int dec32Scale = 4; private static final int dec64Scale = 10; + private static final int dec128Scale = 30; private static final BigDecimal[] decimal32Zoo = new BigDecimal[20]; private static final BigDecimal[] decimal64Zoo = new BigDecimal[20]; + private static final BigDecimal[] decimal128Zoo = new BigDecimal[20]; private static final int[] unscaledDec32Zoo = new int[decimal32Zoo.length]; private static final long[] unscaledDec64Zoo = new long[decimal64Zoo.length]; @@ -45,6 +49,9 @@ public class DecimalColumnVectorTest extends CudfTestBase { private final BigDecimal[] boundaryDecimal64 = new BigDecimal[]{ new BigDecimal("999999999999999999"), new BigDecimal("-999999999999999999")}; + private final BigDecimal[] boundaryDecimal128 = new BigDecimal[]{ + new BigDecimal("99999999999999999999999999999999999999"), new BigDecimal("-99999999999999999999999999999999999999")}; + private final BigDecimal[] overflowDecimal32 = new BigDecimal[]{ BigDecimal.valueOf(Integer.MAX_VALUE), BigDecimal.valueOf(Integer.MIN_VALUE)}; @@ -72,6 +79,12 @@ public static void setup() { } else { decimal64Zoo[i] = null; } + if (rdSeed.nextBoolean()) { + BigInteger unscaledVal = BigInteger.valueOf(rdSeed.nextLong()).multiply(BigInteger.valueOf(rdSeed.nextLong())); + decimal128Zoo[i] = new BigDecimal(unscaledVal, dec128Scale); + } else { + decimal128Zoo[i] = null; + } } } @@ -190,27 +203,44 @@ public void testDecimalGeneral() { @Test public void testDecimalFromDecimals() { - DecimalColumnVectorTest.testDecimalImpl(false, dec32Scale, decimal32Zoo); - DecimalColumnVectorTest.testDecimalImpl(true, dec64Scale, decimal64Zoo); - DecimalColumnVectorTest.testDecimalImpl(false, 0, boundaryDecimal32); - 
DecimalColumnVectorTest.testDecimalImpl(true, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, dec32Scale, decimal32Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, dec64Scale, decimal64Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, dec128Scale, decimal128Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, 0, boundaryDecimal32); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, 0, boundaryDecimal128); } - private static void testDecimalImpl(boolean isInt64, int scale, BigDecimal[] decimalZoo) { - try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { - try (HostColumnVector hcv = cv.copyToHost()) { - assertEquals(-scale, hcv.getType().getScale()); - assertEquals(isInt64, hcv.getType().typeId == DType.DTypeEnum.DECIMAL64); - assertEquals(decimalZoo.length, hcv.rows); - for (int i = 0; i < decimalZoo.length; i++) { - assertEquals(decimalZoo[i] == null, hcv.isNull(i)); - if (decimalZoo[i] != null) { - assertEquals(decimalZoo[i].floatValue(), hcv.getBigDecimal(i).floatValue()); - long backValue = isInt64 ? hcv.getLong(i) : hcv.getInt(i); - assertEquals(decimalZoo[i].setScale(scale, RoundingMode.UNNECESSARY), BigDecimal.valueOf(backValue, scale)); + private static void testDecimalImpl(DType.DTypeEnum decimalType, int scale, BigDecimal[] decimalZoo) { + Consumer assertions = (hcv) -> { + assertEquals(-scale, hcv.getType().getScale()); + assertEquals(hcv.getType().typeId, decimalType); + assertEquals(decimalZoo.length, hcv.rows); + for (int i = 0; i < decimalZoo.length; i++) { + assertEquals(decimalZoo[i] == null, hcv.isNull(i)); + if (decimalZoo[i] != null) { + BigDecimal actual; + switch (decimalType) { + case DECIMAL32: + actual = BigDecimal.valueOf(hcv.getInt(i), scale); + break; + case DECIMAL64: + actual = BigDecimal.valueOf(hcv.getLong(i), scale); + break; + default: + actual = hcv.getBigDecimal(i); } + assertEquals(decimalZoo[i].subtract(actual).longValueExact(), 0L); } } + }; + try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { + try (HostColumnVector hcv = cv.copyToHost()) { + assertions.accept(hcv); + } + } + try (HostColumnVector hcv = ColumnBuilderHelper.fromDecimals(decimalZoo)) { + assertions.accept(hcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java index d82565e1d2d..fa34429685e 100644 --- a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
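The decimal boundary values above (18 nines for DECIMAL64, 38 nines for DECIMAL128) line up with the maximum precisions behind the DType.fromJavaBigDecimal message fix earlier in this patch. A hedged sketch of that precision check, where the constant values are assumed from those boundary tests and the full method (per the hunk above) first returns the narrowest decimal type that fits:

import java.math.BigDecimal;

class DecimalPrecisionSketch {
  // Assumed limits, consistent with the boundary tests above. 2^127 - 1 has
  // 39 digits, so every 38-digit unscaled value fits in a 128-bit integer.
  static final int DECIMAL64_MAX_PRECISION = 18;
  static final int DECIMAL128_MAX_PRECISION = 38;

  static void requireSupportedPrecision(BigDecimal dec) {
    if (dec.precision() > DECIMAL128_MAX_PRECISION) {
      // The DType.java fix: report the DECIMAL128 limit (38), not the DECIMAL64 limit (18).
      throw new IllegalArgumentException("Precision " + dec.precision() +
          " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION);
    }
  }
}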
@@ -22,6 +22,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -40,34 +41,51 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(0), 2.1, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(1), 3.02, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(2), 5.003, 0.001); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEqualsWithinPercentage(cv.getDouble(0), 2.1, 0.01); + assertEqualsWithinPercentage(cv.getDouble(1), 3.02, 0.01); + assertEqualsWithinPercentage(cv.getDouble(2), 5.003, 0.001); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(3)); - assertFalse(doubleColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getDouble(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getDouble(-1)); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = - HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -75,6 +93,14 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector dcv = + HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromBoxedDoubles( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index 2fb8164534b..7d6311fb24c 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -34,47 +35,75 @@ public void testCreateColumnVectorBuilder() { try (ColumnVector intColumnVector = ColumnVector.build(DType.INT32, 3, (b) -> b.append(1))) { assertFalse(intColumnVector.hasNulls()); } + try (ColumnVector intColumnVector = ColumnBuilderHelper.buildOnDevice( + new HostColumnVector.BasicType(true, DType.INT32), 3, (b) -> b.append(1))) { + assertFalse(intColumnVector.hasNulls()); + } } @Test public void testArrayAllocation() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertEquals(intColumnVector.getInt(0), 2); - assertEquals(intColumnVector.getInt(1), 3); - assertEquals(intColumnVector.getInt(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getInt(0), 2); + assertEquals(cv.getInt(1), 3); + assertEquals(cv.getInt(2), 5); + }; + try (HostColumnVector cv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(cv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { - assertFalse(v.hasNulls()); - assertEquals(0xfedcba98L, Integer.toUnsignedLong(v.getInt(0))); - assertEquals(0x80000000L, Integer.toUnsignedLong(v.getInt(1))); - assertEquals(5, Integer.toUnsignedLong(v.getInt(2))); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xfedcba98L, Integer.toUnsignedLong(cv.getInt(0))); + assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(1))); + assertEquals(5, Integer.toUnsignedLong(cv.getInt(2))); + }; + try (HostColumnVector cv = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(false, 0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertThrows(AssertionError.class, () -> intColumnVector.getInt(3)); - assertFalse(intColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getInt(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> intColumnVector.getInt(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getInt(-1)); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, 
null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -82,13 +111,18 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(true, 2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( - 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -98,6 +132,14 @@ public void testAddingUnsignedNullValues() { assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(false, + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); } } diff --git a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java index 43c2b5a99c2..193992f5304 100644 --- a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
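These test refactors run the same assertions against both the HostColumnVector factories and the rewritten ColumnBuilder (via ColumnBuilderHelper). One behavior worth noting: with the lazy validity buffer, an all-non-null build never allocates a null mask at all; the mask is materialized only when the first null arrives. A small usage sketch assembled from APIs shown in this patch (illustrative, not an added test):

try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(
    new HostColumnVector.BasicType(true, DType.INT32), 3)) {
  b.append(2).append(3); // fixed-width grow path runs; no valid buffer exists yet
  b.appendNull();        // first null: the null handler materializes the valid buffer
  try (HostColumnVector cv = b.build()) {
    assert cv.hasNulls();
    assert cv.getNullCount() == 1;
    assert cv.isNull(2);
  }
}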
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -38,46 +39,71 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertEquals(longColumnVector.getLong(0), 2); - assertEquals(longColumnVector.getLong(1), 3); - assertEquals(longColumnVector.getLong(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getLong(0), 2); + assertEquals(cv.getLong(1), 3); + assertEquals(cv.getLong(2), 5); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true,2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromUnsignedLongs( - 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); assertEquals(Long.toUnsignedString(0xfedcba9876543210L), - Long.toUnsignedString(longColumnVector.getLong(0))); + Long.toUnsignedString(cv.getLong(0))); assertEquals(Long.toUnsignedString(0x8000000000000000L), - Long.toUnsignedString(longColumnVector.getLong(1))); - assertEquals(5L, longColumnVector.getLong(2)); + Long.toUnsignedString(cv.getLong(1))); + assertEquals(5L, cv.getLong(2)); + }; + try (HostColumnVector lcv = HostColumnVector.fromUnsignedLongs( + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(false, + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertThrows(AssertionError.class, () -> longColumnVector.getLong(3)); - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getLong(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> longColumnVector.getLong(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getLong(-1)); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -85,13 +111,19 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = 
HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(true, + 2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedLongs( - 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -103,6 +135,14 @@ public void testAddingUnsignedNullValues() { Long.toUnsignedString(cv.getLong(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedUnsignedLongs( + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(false, + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); } } diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 5b81d369ef5..897b61f8001 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -33,11 +33,13 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates(table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + - - cdef size_type distinct_count(column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] drop_duplicates( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + + + cdef size_type unordered_distinct_count( + column_view source_table, + null_policy null_handling, + nan_policy nan_handling) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 4330c565982..c4f885382f3 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import pandas as pd @@ -11,10 +11,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - distinct_count as cpp_distinct_count, drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, + unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -190,7 +190,7 @@ def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): cdef column_view source_column_view = source_column.view() with nogil: - count = cpp_distinct_count( + count = cpp_unordered_distinct_count( source_column_view, cpp_null_handling, cpp_nan_handling diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2866c1c003c..19313dd3fe2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -986,7 +986,7 @@ def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed. To explicitly construct a host array, consider using " - ".to_array()" + ".to_numpy()" ) @property @@ -1271,6 +1271,12 @@ def column_empty( column_empty(row_count, field_dtype) for field_dtype in dtype.fields.values() ) + elif is_list_dtype(dtype): + data = None + children = ( + full(row_count + 1, 0, dtype="int32"), + column_empty(row_count, dtype=dtype.element_type), + ) elif is_categorical_dtype(dtype): data = None children = ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 323a5ad088a..3735a949277 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6027,6 +6027,37 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) + def nunique(self, axis=0, dropna=True): + """ + Count number of distinct elements in specified axis. + Return Series with number of distinct elements. Can ignore NaN values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + """ + if axis != 0: + raise NotImplementedError("axis parameter is not supported yet.") + + return cudf.Series(super().nunique(method="sort", dropna=dropna)) + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index bc666430189..2e01a29b961 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2,6 +2,7 @@ from __future__ import annotations +import builtins import copy import pickle import warnings @@ -5982,12 +5983,12 @@ def eq(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.eq(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.eq(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True False False 2 True True False False @@ -6057,12 +6058,12 @@ def ne(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... 
) >>> left.ne(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.ne(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False True True 2 False False True True @@ -6132,12 +6133,12 @@ def lt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.lt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.lt(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False False True 2 False False False True @@ -6207,12 +6208,12 @@ def le(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.le(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.le(right, fill_value=7) - a b c d + a b c d 0 True True True True 1 True True False True 2 True True False True @@ -6282,12 +6283,12 @@ def gt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.gt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.gt(right, fill_value=7) - a b c d + a b c d 0 False False False False 1 False False True False 2 False False True False @@ -6357,12 +6358,12 @@ def ge(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.ge(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.ge(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True True False 2 True True True False @@ -6402,6 +6403,28 @@ def ge(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns a mapping of each column name to the number of unique values + in that column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by the underlying distinct-count implementation + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + dict + Mapping of column name to its count of unique values. + """ + return { + name: col.distinct_count(method=method, dropna=dropna) + for name, col in self._data.items() + } + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 66194f0f877..12a2538b776 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2756,7 +2756,7 @@ def nunique(self, method="sort", dropna=True): raise NotImplementedError(msg) if self.null_count == len(self): return 0 - return self._column.distinct_count(method, dropna) + return super().nunique(method, dropna) def value_counts( self, diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 2623569afac..ef479f19363 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,6 +3,7 @@ from __future__ import annotations +import builtins from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy @@ -325,3 +326,21 @@ def _make_operands_for_binop( return NotImplemented return {result_name: (self._column, other, reflect, fill_value)} + + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Return the count of unique values in the column.
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 889662c8a1c..ba2caf7c6c8 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9081,6 +9081,32 @@ def test_dataframe_assign_cp_np_array():
     assert_eq(pdf, gdf)


+@pytest.mark.parametrize(
+    "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}],
+)
+def test_dataframe_nunique(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.nunique()
+    expected = pdf.nunique()
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
+)
+def test_dataframe_nunique_index(data):
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    actual = gdf.index.nunique()
+    expected = pdf.index.nunique()
+
+    assert_eq(expected, actual)
+
+
 def test_dataframe_rename_duplicate_column():
     gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     with pytest.raises(
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 44749103b54..fc9ad9711d1 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 import functools
 import operator

@@ -586,3 +587,18 @@ def test_listcol_setitem_error_cases(data, item, error):
     sr = cudf.Series(data)
    with pytest.raises(BaseException, match=error):
         sr[1] = item
+
+
+def test_listcol_setitem_retain_dtype():
+    df = cudf.DataFrame(
+        {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]}
+    )
+    df1 = df.head(0)
+    # Performing a setitem on `b` triggers a `column.column_empty_like` call,
+    # which tries to create an empty ListColumn.
+    df1["b"] = df1["c"]
+    # Performing a copy triggers a dtype copy, which is obtained by accessing
+    # `ListColumn.children`; these would have been corrupted by the previous
+    # call prior to this fix: https://github.com/rapidsai/cudf/pull/10151/
+    df2 = df1.copy()
+    assert df2["a"].dtype == df["a"].dtype
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 1c80fe80f2d..358484d79b9 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1521,6 +1521,32 @@ def test_series_transpose(data):
     assert_eq(cudf_transposed, csr)


+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique(data):
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+
+    actual = cd_s.nunique()
+    expected = pd_s.nunique()
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data", [[1, 3, 5, 7, 7]],
+)
+def test_series_nunique_index(data):
+    cd_s = cudf.Series(data)
+    pd_s = cd_s.to_pandas()
+
+    actual = cd_s.index.nunique()
+    expected = pd_s.index.nunique()
+
+    assert_eq(expected, actual)
+
+
 @pytest.mark.parametrize(
     "fill_value,data",
     [
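The Series tests above only cover non-null data, so the null-handling contract is worth spelling out: `Series.nunique` returns 0 for an all-null series and excludes nulls by default, while `dropna=False` counts the null as one extra distinct value. A small sketch of the expected semantics, derived from the code in this diff rather than copied from a live session:

    import cudf

    s = cudf.Series([1, 2, None, 2])
    print(s.nunique())              # 2: nulls excluded by default
    print(s.nunique(dropna=False))  # 3: the null counts as a distinct value
    # All-null series short-circuits to 0 before reaching distinct_count.
    print(cudf.Series([None, None], dtype="float64").nunique())  # 0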
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 149d98ebfb9..1bc270a5b9f 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 import math
 from operator import getitem
 from typing import Set
@@ -42,19 +43,11 @@ def __init__(self, *args, **kwargs):
     def __getitem__(self, key):
         if isinstance(key, list):
             g = CudfDataFrameGroupBy(
-                self.obj,
-                by=self.index,
-                slice=key,
-                sort=self.sort,
-                **self.dropna,
+                self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna,
             )
         else:
             g = CudfSeriesGroupBy(
-                self.obj,
-                by=self.index,
-                slice=key,
-                sort=self.sort,
-                **self.dropna,
+                self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna,
             )

         g._meta = g._meta[key]
@@ -63,8 +56,8 @@ def __getitem__(self, key):
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
-            {c: "mean" for c in self.obj.columns if c not in self.index},
+            self.by,
+            {c: "mean" for c in self.obj.columns if c not in self.by},
             split_every=split_every,
             split_out=split_out,
             dropna=self.dropna,
@@ -76,8 +69,8 @@ def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
-            {c: "collect" for c in self.obj.columns if c not in self.index},
+            self.by,
+            {c: "collect" for c in self.obj.columns if c not in self.by},
             split_every=split_every,
             split_out=split_out,
             dropna=self.dropna,
@@ -94,10 +87,10 @@ def aggregate(self, arg, split_every=None, split_out=1):
         if (
             isinstance(self.obj, DaskDataFrame)
             and (
-                isinstance(self.index, str)
+                isinstance(self.by, str)
                 or (
-                    isinstance(self.index, list)
-                    and all(isinstance(x, str) for x in self.index)
+                    isinstance(self.by, list)
+                    and all(isinstance(x, str) for x in self.by)
                 )
             )
             and _is_supported(arg, SUPPORTED_AGGS)
@@ -133,7 +126,7 @@ def __init__(self, *args, **kwargs):
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "mean"},
             split_every=split_every,
             split_out=split_out,
@@ -146,7 +139,7 @@ def mean(self, split_every=None, split_out=1):
     def std(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "std"},
             split_every=split_every,
             split_out=split_out,
@@ -159,7 +152,7 @@ def std(self, split_every=None, split_out=1):
     def var(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "var"},
             split_every=split_every,
             split_out=split_out,
@@ -172,7 +165,7 @@ def var(self, split_every=None, split_out=1):
     def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
-            self.index,
+            self.by,
             {self._slice: "collect"},
             split_every=split_every,
             split_out=split_out,
@@ -192,12 +185,12 @@ def aggregate(self, arg, split_every=None, split_out=1):

         if (
             isinstance(self.obj, DaskDataFrame)
-            and isinstance(self.index, (str, list))
+            and isinstance(self.by, (str, list))
             and _is_supported(arg, SUPPORTED_AGGS)
         ):
             return groupby_agg(
                 self.obj,
-                self.index,
+                self.by,
                 arg,
                 split_every=split_every,
                 split_out=split_out,
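The `self.index` to `self.by` renames track the corresponding attribute rename on groupby objects in upstream Dask; the wrapped aggregation logic is otherwise unchanged. A small end-to-end sketch of the affected path, assuming a working dask_cudf installation and a CUDA-capable GPU:

    import cudf
    import dask_cudf

    gdf = cudf.DataFrame(
        {"key": [0, 1, 1, 0, 0, 1], "val": [1.0, 8.0, 3.0, 9.0, -3.0, 8.0]}
    )
    ddf = dask_cudf.from_cudf(gdf, npartitions=2)
    # mean() and aggregate() now read the grouping column(s) from `.by`
    print(ddf.groupby("key").mean().compute())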