diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 5f0be6d797a..59e2ea224f6 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -14,9 +14,9 @@ python/dask_cudf/  @rapidsai/cudf-dask-codeowners
 java/              @rapidsai/cudf-java-codeowners
 
 #build/ops code owners
-.github/           @rapidsai/ops-codeowners 
-/ci/                @rapidsai/ops-codeowners
+.github/           @rapidsai/ops-codeowners
+/ci/               @rapidsai/ops-codeowners
 conda/             @rapidsai/ops-codeowners
-**/Dockerfile      @rapidsai/ops-codeowners
-**/.dockerignore   @rapidsai/ops-codeowners
-docker/            @rapidsai/ops-codeowners
+/Dockerfile        @rapidsai/ops-codeowners
+/.dockerignore     @rapidsai/ops-codeowners
+/docker/           @rapidsai/ops-codeowners
diff --git a/README.md b/README.md
index c0fa500ad77..687d25c200b 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,17 @@
 
 **NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch.
 
+## Resources
+
+- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides.
+- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference.
+- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF.
+- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate.
+- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code.
+- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features.
+
+## Overview
+
 Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data.
 
 cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming.
diff --git a/build.sh b/build.sh
index d75053f8849..bc49b76d44e 100755
--- a/build.sh
+++ b/build.sh
@@ -192,6 +192,7 @@ fi
 # Build libcudf_kafka library
 if hasArg libcudf_kafka; then
     cmake -S $REPODIR/cpp/libcudf_kafka -B ${KAFKA_LIB_BUILD_DIR} \
+          ${CUDF_CMAKE_CUDA_ARCHITECTURES} \
           -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
           -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 5657d21889f..1be8a6b450a 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -58,6 +58,7 @@ test:
     - test -f $PREFIX/include/cudf/ast/linearizer.hpp
     - test -f $PREFIX/include/cudf/ast/operators.hpp
     - test -f $PREFIX/include/cudf/binaryop.hpp
+    - test -f $PREFIX/include/cudf/labeling/label_bins.hpp
     - test -f $PREFIX/include/cudf/column/column_factories.hpp
     - test -f $PREFIX/include/cudf/column/column.hpp
     - test -f $PREFIX/include/cudf/column/column_view.hpp
@@ -66,6 +67,7 @@ test:
     - test -f $PREFIX/include/cudf/datetime.hpp
     - test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp
     - test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp
+    - test -f $PREFIX/include/cudf/detail/label_bins.hpp
     - test -f $PREFIX/include/cudf/detail/binaryop.hpp
     - test -f $PREFIX/include/cudf/detail/concatenate.hpp
     - test -f $PREFIX/include/cudf/detail/copy.hpp
@@ -132,6 +134,7 @@ test:
     - test -f $PREFIX/include/cudf/join.hpp
     - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
     - test -f $PREFIX/include/cudf/lists/detail/copying.hpp
+    - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp
     - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
     - test -f $PREFIX/include/cudf/lists/count_elements.hpp
     - test -f $PREFIX/include/cudf/lists/explode.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3e875b71ca6..fc439ebfa7f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -156,6 +156,7 @@ add_library(cudf
     src/binaryop/jit/code/kernel.cpp
     src/binaryop/jit/code/operation.cpp
     src/binaryop/jit/code/traits.cpp
+    src/labeling/label_bins.cu
     src/bitmask/null_mask.cu
     src/column/column.cu
     src/column/column_device_view.cu
@@ -194,7 +195,7 @@ add_library(cudf
     src/groupby/hash/groupby.cu
     src/groupby/sort/group_argmax.cu
     src/groupby/sort/group_argmin.cu
-    src/groupby/sort/groupby.cu
+    src/groupby/sort/aggregate.cpp
     src/groupby/sort/group_collect.cu
     src/groupby/sort/group_count.cu
     src/groupby/sort/group_max.cu
@@ -204,6 +205,11 @@ add_library(cudf
     src/groupby/sort/group_quantiles.cu
     src/groupby/sort/group_std.cu
     src/groupby/sort/group_sum.cu
+    src/groupby/sort/scan.cpp
+    src/groupby/sort/group_count_scan.cu
+    src/groupby/sort/group_max_scan.cu
+    src/groupby/sort/group_min_scan.cu
+    src/groupby/sort/group_sum_scan.cu
     src/groupby/sort/sort_helper.cu
     src/hash/hashing.cu
     src/interop/dlpack.cpp
@@ -410,7 +416,7 @@ target_compile_options(cudf
 
 target_compile_definitions(cudf
             PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_DEFINITIONS}>"
-                   "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_DEFINITIONS}>"
+                   "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_DEFINITIONS}>>"
 )
 
 # Disable Jitify log printing. See https://github.com/NVIDIA/jitify/issues/79
@@ -505,10 +511,11 @@ add_library(cudftestutil STATIC
 
 target_compile_options(cudftestutil
             PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
-                   "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
+                   "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>>"
 )
 
-target_compile_features(cudftestutil PUBLIC cxx_std_14 cuda_std_14)
+target_compile_features(cudftestutil
+    PUBLIC cxx_std_14 $<BUILD_INTERFACE:cuda_std_14>)
 
 target_link_libraries(cudftestutil
                PUBLIC GTest::gmock
@@ -522,7 +529,7 @@ target_include_directories(cudftestutil
 
 install(TARGETS cudftestutil
         DESTINATION lib
-        EXPORT cudf-targets)
+        EXPORT cudf-testing-targets)
 
 add_library(cudf::cudftestutil ALIAS cudftestutil)
 
@@ -600,6 +607,11 @@ install(EXPORT  cudf-targets
     NAMESPACE   cudf::
     DESTINATION "${INSTALL_CONFIGDIR}")
 
+install(EXPORT  cudf-testing-targets
+    FILE        cudf-testing-targets.cmake
+    NAMESPACE   cudf::
+    DESTINATION "${INSTALL_CONFIGDIR}")
+
 ################################################################################################
 # - build export -------------------------------------------------------------------------------
 configure_package_config_file(cmake/cudf-build-config.cmake.in ${CUDF_BINARY_DIR}/cudf-config.cmake
@@ -628,6 +640,10 @@ export(EXPORT cudf-targets
     FILE ${CUDF_BINARY_DIR}/cudf-targets.cmake
     NAMESPACE   cudf::)
 
+export(EXPORT cudf-testing-targets
+    FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake
+    NAMESPACE   cudf::)
+
 
 ###################################################################################################
 # - make documentation ----------------------------------------------------------------------------
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ded5a4bb596..7fd84b508ac 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -173,8 +173,12 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp)
 ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)
 
 ###################################################################################################
-# - subword tokenizer benchmark -------------------------------------------------------------------
-ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp)
+# - nvtext benchmark -------------------------------------------------------------------
+ConfigureBench(TEXT_BENCH
+  text/normalize_benchmark.cpp
+  text/normalize_spaces_benchmark.cpp
+  text/tokenize_benchmark.cpp
+  text/subword_benchmark.cpp)
 
 ###################################################################################################
 # - strings benchmark -------------------------------------------------------------------
@@ -191,6 +195,8 @@ ConfigureBench(STRINGS_BENCH
   string/filter_benchmark.cpp
   string/find_benchmark.cpp
   string/replace_benchmark.cpp
+  string/replace_re_benchmark.cpp
   string/split_benchmark.cpp
   string/substring_benchmark.cpp
+  string/translate_benchmark.cpp
   string/url_decode_benchmark.cpp)
diff --git a/cpp/benchmarks/string/replace_re_benchmark.cpp b/cpp/benchmarks/string/replace_re_benchmark.cpp
new file mode 100644
index 00000000000..616e2c0f22c
--- /dev/null
+++ b/cpp/benchmarks/string/replace_re_benchmark.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/strings/replace_re.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+class StringReplace : public cudf::benchmark {
+};
+
+enum replace_type { replace_re, replace_re_multi, replace_backref };
+
+static void BM_replace(benchmark::State& state, replace_type rt)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  cudf::test::strings_column_wrapper repls({"#", ""});
+  // each benchmark iteration runs the one replace variant selected by rt
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    switch (rt) {
+      case replace_type::replace_re:  // one pattern; no replacement argument is passed
+        cudf::strings::replace_re(input, "\\d+");
+        break;
+      case replace_type::replace_re_multi:  // two patterns with the two-entry repls column
+        cudf::strings::replace_re(input, {"\\d+", "\\s+"}, cudf::strings_column_view(repls));
+        break;
+      case replace_type::replace_backref:  // one capture group, back-referenced in the template
+        cudf::strings::replace_with_backrefs(input, "(\\d+)", "#\\1X");
+        break;
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  generate_string_bench_args(b,
+                             1 << 12,  // min_rows
+                             1 << 24,  // max_rows
+                             8,        // row_mult
+                             1 << 5,   // min_rowlen
+                             1 << 13,  // max_rowlen
+                             4);       // len_mult
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)                \
+  BENCHMARK_DEFINE_F(StringReplace, name)             \
+  (::benchmark::State & st) { BM_replace(st, name); } \
+  BENCHMARK_REGISTER_F(StringReplace, name)           \
+    ->Apply(generate_bench_args)                      \
+    ->UseManualTime()                                 \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(replace_re)
+STRINGS_BENCHMARK_DEFINE(replace_re_multi)
+STRINGS_BENCHMARK_DEFINE(replace_backref)
diff --git a/cpp/benchmarks/string/translate_benchmark.cpp b/cpp/benchmarks/string/translate_benchmark.cpp
new file mode 100644
index 00000000000..c49a986d744
--- /dev/null
+++ b/cpp/benchmarks/string/translate_benchmark.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/translate.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <algorithm>
+
+#include <thrust/iterator/counting_iterator.h>
+
+class StringTranslate : public cudf::benchmark {
+};
+
+using entry_type = std::pair<cudf::char_utf8, cudf::char_utf8>;
+
+static void BM_translate(benchmark::State& state, int entry_count)
+{
+  // benchmark args: range(0) = number of rows, range(1) = max string length for the profile
+  auto const n_rows         = static_cast<cudf::size_type>(state.range(0));
+  auto const max_str_length = static_cast<cudf::size_type>(state.range(1));
+  data_profile profile;
+  profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const random_strings =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, profile);
+  cudf::strings_column_view input(random_strings->view().column(0));
+
+  // entry idx maps the character idx places above '!' to the one idx places below '~'
+  std::vector<entry_type> entries;
+  entries.reserve(entry_count);
+  for (int idx = 0; idx < entry_count; ++idx) {
+    entries.push_back(entry_type{'!' + idx, '~' - idx});
+  }
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    cudf::strings::translate(input, entries);
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  generate_string_bench_args(b,
+                             1 << 12,  // min_rows
+                             1 << 24,  // max_rows
+                             8,        // row_mult
+                             1 << 5,   // min_rowlen
+                             1 << 13,  // max_rowlen
+                             4);       // len_mult
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name, entries)            \
+  BENCHMARK_DEFINE_F(StringTranslate, name)                \
+  (::benchmark::State & st) { BM_translate(st, entries); } \
+  BENCHMARK_REGISTER_F(StringTranslate, name)              \
+    ->Apply(generate_bench_args)                           \
+    ->UseManualTime()                                      \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(translate_small, 5)
+STRINGS_BENCHMARK_DEFINE(translate_medium, 25)
+STRINGS_BENCHMARK_DEFINE(translate_large, 50)
diff --git a/cpp/benchmarks/text/normalize_benchmark.cpp b/cpp/benchmarks/text/normalize_benchmark.cpp
new file mode 100644
index 00000000000..32c4fb7dcde
--- /dev/null
+++ b/cpp/benchmarks/text/normalize_benchmark.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/normalize.hpp>
+
+class TextNormalize : public cudf::benchmark {
+};
+
+static void BM_normalize(benchmark::State& state, bool to_lower)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile profile;
+  profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const random_strings =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, profile);
+  cudf::strings_column_view input(random_strings->view().column(0));
+
+  // the input table is built once; only the normalize_characters call sits inside the loop
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    nvtext::normalize_characters(input, to_lower);
+  }
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  // rows sweep 2^12..2^24 in steps of x8; row lengths sweep 2^5..2^13 in steps of x4
+  constexpr int min_rows   = 1 << 12;
+  constexpr int max_rows   = 1 << 24;
+  constexpr int row_mult   = 8;
+  constexpr int min_rowlen = 1 << 5;
+  constexpr int max_rowlen = 1 << 13;
+  constexpr int len_mult   = 4;
+  constexpr size_t ceiling = std::numeric_limits<cudf::size_type>::max();
+  for (int n_rows = min_rows; n_rows <= max_rows; n_rows *= row_mult) {
+    for (int n_chars = min_rowlen; n_chars <= max_rowlen; n_chars *= len_mult) {
+      // skip combinations where rows * length * 4 is not below the cudf::size_type maximum
+      auto const total_chars = static_cast<size_t>(n_rows) * n_chars * 4;
+      if (total_chars < ceiling) { b->Args({n_rows, n_chars}); }
+    }
+  }
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(name, lower)             \
+  BENCHMARK_DEFINE_F(TextNormalize, name)                \
+  (::benchmark::State & st) { BM_normalize(st, lower); } \
+  BENCHMARK_REGISTER_F(TextNormalize, name)              \
+    ->Apply(generate_bench_args)                         \
+    ->UseManualTime()                                    \
+    ->Unit(benchmark::kMillisecond);
+
+NVTEXT_BENCHMARK_DEFINE(characters, false)
+NVTEXT_BENCHMARK_DEFINE(to_lower, true)
diff --git a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp
new file mode 100644
index 00000000000..dcabb0c225c
--- /dev/null
+++ b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/string/string_bench_args.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/normalize.hpp>
+
+class TextNormalize : public cudf::benchmark {
+};
+
+static void BM_normalize(benchmark::State& state)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile profile;
+  profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const random_strings =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, profile);
+  cudf::strings_column_view input(random_strings->view().column(0));
+
+  // the input table is built once; only the normalize_spaces call sits inside the loop
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    nvtext::normalize_spaces(input);
+  }
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  generate_string_bench_args(b,
+                             1 << 12,  // min_rows
+                             1 << 24,  // max_rows
+                             8,        // row_mult
+                             1 << 5,   // min_rowlen
+                             1 << 13,  // max_rowlen
+                             4);       // len_mult
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(name)             \
+  BENCHMARK_DEFINE_F(TextNormalize, name)         \
+  (::benchmark::State & st) { BM_normalize(st); } \
+  BENCHMARK_REGISTER_F(TextNormalize, name)       \
+    ->Apply(generate_bench_args)                  \
+    ->UseManualTime()                             \
+    ->Unit(benchmark::kMillisecond);
+
+NVTEXT_BENCHMARK_DEFINE(spaces)
diff --git a/cpp/benchmarks/text/tokenize_benchmark.cpp b/cpp/benchmarks/text/tokenize_benchmark.cpp
new file mode 100644
index 00000000000..f9e742f0f31
--- /dev/null
+++ b/cpp/benchmarks/text/tokenize_benchmark.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/string/string_bench_args.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/ngrams_tokenize.hpp>
+#include <nvtext/tokenize.hpp>
+
+class TextTokenize : public cudf::benchmark {
+};
+
+enum class tokenize_type { single, multi, count, count_multi, ngrams };
+
+static void BM_tokenize(benchmark::State& state, tokenize_type tt)
+{
+  // benchmark args: range(0) = number of rows, range(1) = max string length for the profile
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile profile;
+  profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const random_strings =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, profile);
+  cudf::strings_column_view input(random_strings->view().column(0));
+  cudf::test::strings_column_wrapper delimiters({" ", "+", "-"});
+
+  // each iteration issues the single nvtext call selected by tt
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    if (tt == tokenize_type::single) {
+      nvtext::tokenize(input);
+    } else if (tt == tokenize_type::multi) {
+      nvtext::tokenize(input, cudf::strings_column_view(delimiters));
+    } else if (tt == tokenize_type::count) {
+      nvtext::count_tokens(input);
+    } else if (tt == tokenize_type::count_multi) {
+      nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
+    } else if (tt == tokenize_type::ngrams) {
+      // no ngram size is passed, so the declaration's default applies (bigrams, per the author)
+      nvtext::ngrams_tokenize(input);
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  generate_string_bench_args(b,
+                             1 << 12,  // min_rows
+                             1 << 24,  // max_rows
+                             8,        // row_mult
+                             1 << 5,   // min_rowlen
+                             1 << 13,  // max_rowlen
+                             4);       // len_mult
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(name)                                 \
+  BENCHMARK_DEFINE_F(TextTokenize, name)                              \
+  (::benchmark::State & st) { BM_tokenize(st, tokenize_type::name); } \
+  BENCHMARK_REGISTER_F(TextTokenize, name)                            \
+    ->Apply(generate_bench_args)                                      \
+    ->UseManualTime()                                                 \
+    ->Unit(benchmark::kMillisecond);
+
+NVTEXT_BENCHMARK_DEFINE(single)
+NVTEXT_BENCHMARK_DEFINE(multi)
+NVTEXT_BENCHMARK_DEFINE(count)
+NVTEXT_BENCHMARK_DEFINE(count_multi)
+NVTEXT_BENCHMARK_DEFINE(ngrams)
diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake
index d4be6e65021..b0d048c6294 100644
--- a/cpp/cmake/Modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/Modules/ConfigureCUDA.cmake
@@ -18,7 +18,7 @@
 find_package(CUDAToolkit REQUIRED)
 
 # Auto-detect available GPU compute architectures
-include(${CUDF_SOURCE_DIR}/cmake/Modules/SetGPUArchs.cmake)
+include(${CMAKE_CURRENT_LIST_DIR}/SetGPUArchs.cmake)
 message(STATUS "CUDF: Building CUDF for GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
 # Must come after find_package(CUDAToolkit) because we symlink
@@ -29,10 +29,6 @@ enable_language(CUDA)
 
 if(CMAKE_COMPILER_IS_GNUCXX)
     list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
-    if(CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS)
-        # Suppress parentheses warning which causes gmock to fail
-        list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-parentheses)
-    endif()
 endif()
 
 list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
@@ -46,6 +42,9 @@ if(DISABLE_DEPRECATION_WARNING)
     list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations)
 endif()
 
+# Compress all fatbin sections so the resulting binaries are as small as possible
+list(APPEND CUDF_CUDA_FLAGS -Xfatbin=-compress-all)
+
 # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking
 if(CUDA_ENABLE_LINEINFO)
     list(APPEND CUDF_CUDA_FLAGS -lineinfo)
diff --git a/cpp/cmake/Modules/SetGPUArchs.cmake b/cpp/cmake/Modules/SetGPUArchs.cmake
index 61e4e6bc198..f09d5ead8e2 100644
--- a/cpp/cmake/Modules/SetGPUArchs.cmake
+++ b/cpp/cmake/Modules/SetGPUArchs.cmake
@@ -58,7 +58,7 @@ if(${PROJECT_NAME}_BUILD_FOR_ALL_ARCHS)
   list(APPEND CMAKE_CUDA_ARCHITECTURES ${latest_arch})
 
 elseif(${PROJECT_NAME}_BUILD_FOR_DETECTED_ARCHS)
-  include(${PROJECT_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake)
+  include(${CMAKE_CURRENT_LIST_DIR}/EvalGPUArchs.cmake)
   evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES)
 
   list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real")
diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in
index 3f4d2e5586e..d0c5a608e45 100644
--- a/cpp/cmake/cudf-build-config.cmake.in
+++ b/cpp/cmake/cudf-build-config.cmake.in
@@ -50,6 +50,11 @@ if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake")
   include("${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake")
 endif()
 include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake")
+
+if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
+  include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
+endif()
+
 include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake")
 
 check_required_components(cudf)
diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in
index 0a478516f18..14f8a661c2f 100644
--- a/cpp/cmake/cudf-config.cmake.in
+++ b/cpp/cmake/cudf-config.cmake.in
@@ -1,7 +1,70 @@
 @PACKAGE_INIT@
 
+
+#[=======================================================================[
+
+Provide targets for the cudf library.
+
+Built based on the Apache Arrow columnar memory format, cuDF is a GPU DataFrame
+library for loading, joining, aggregating, filtering, and otherwise
+manipulating data.
+
+cuDF provides a pandas-like API that will be familiar to data engineers &
+data scientists, so they can use it to easily accelerate their workflows
+without going into the details of CUDA programming.
+
+
+Imported Targets
+^^^^^^^^^^^^^^^^
+
+If cudf is found, this module defines the following IMPORTED GLOBAL
+targets:
+
+ cudf::cudf             - The main cudf library.
+
+This module offers an optional component named "testing" (requested with
+find_package(cudf COMPONENTS testing)) which defines the following IMPORTED GLOBAL targets:
+
+ cudf::cudftestutil     - The main cudf testing library
+ cudf::gmock
+ cudf::gmock_main
+ cudf::gtest
+ cudf::gtest_main
+
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This module will set the following variables in your project::
+
+  CUDF_FOUND
+  CUDF_VERSION
+  CUDF_VERSION_MAJOR
+  CUDF_VERSION_MINOR
+
+#]=======================================================================]
+
+
 cmake_minimum_required(VERSION 3.18)
 
+# Record which known targets do not exist yet; only those are later promoted to
+# IMPORTED_GLOBAL. The list is reset first: a value inherited from a parent directory scope
+# names targets created elsewhere, and CMake only allows promotion in the creating directory.
+set(_targets_to_promote "")
+set(_possible_targets_to_promote
+      cudf::cudf
+      cudf::benchmark cudf::benchmark_main
+      cudf::gmock     cudf::gmock_main
+      cudf::gtest     cudf::gtest_main
+      cudf::cudftestutil
+      rmm::rmm
+      arrow_shared    arrow_cuda_shared )
+foreach(t IN LISTS _possible_targets_to_promote)
+  if(NOT TARGET ${t})
+    list(APPEND _targets_to_promote ${t})
+  endif()
+endforeach()
+
 set(CUDF_VERSION @CUDF_VERSION@)
 set(CUDF_VERSION_MAJOR @CUDF_VERSION_MAJOR@)
 set(CUDF_VERSION_MINOR @CUDF_VERSION_MINOR@)
@@ -26,7 +89,6 @@ set(ArrowCUDA_DIR "${Arrow_DIR}")
 find_dependency(ArrowCUDA @CUDF_VERSION_Arrow@)
 
 find_dependency(rmm @CUDF_MIN_VERSION_rmm@)
-find_dependency(GTest @CUDF_MIN_VERSION_GTest@)
 
 set(Thrust_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust")
 find_dependency(Thrust @CUDF_MIN_VERSION_Thrust@)
@@ -35,10 +97,23 @@ thrust_create_target(cudf::Thrust FROM_OPTIONS)
 list(POP_FRONT CMAKE_MODULE_PATH)
 
 include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake")
+
+if("testing" IN_LIST cudf_FIND_COMPONENTS)  # optional component carrying the test utilities
+  enable_language(CUDA)
+  find_dependency(GTest @CUDF_MIN_VERSION_GTest@)
+  include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
+  set(cudf_testing_FOUND TRUE)  # reported so check_required_components(cudf) accepts it
+endif()
+
 include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake")
 
 check_required_components(cudf)
 
+foreach(t IN LISTS _targets_to_promote)  # names recorded as missing before the package loaded
+  if(TARGET ${t})
+    set_property(TARGET ${t} PROPERTY IMPORTED_GLOBAL TRUE)
+  endif()
+endforeach()
 set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
 
 include(FindPackageHandleStandardArgs)
diff --git a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake
index 5162aaf6ce7..19c07933d42 100644
--- a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake
@@ -23,7 +23,8 @@ include(${CPM_DOWNLOAD_LOCATION})
 function(fix_cmake_global_defaults target)
     if(TARGET ${target})
         get_target_property(_is_imported ${target} IMPORTED)
-        if(_is_imported)
+        get_target_property(_already_global ${target} IMPORTED_GLOBAL)
+        if(_is_imported AND NOT _already_global)
             set_target_properties(${target} PROPERTIES IMPORTED_GLOBAL TRUE)
         endif()
     endif()
diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
index e346dce1730..666ba0fbb2c 100644
--- a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
@@ -15,6 +15,11 @@
 #=============================================================================
 
 function(find_and_configure_gtest VERSION)
+
+    if(TARGET GTest::gtest)
+        return()
+    endif()
+
     # Find or install GoogleTest
     CPMFindPackage(NAME GTest
         VERSION         ${VERSION}
@@ -44,7 +49,7 @@ function(find_and_configure_gtest VERSION)
                         gmock_main
                         gtest_main
             DESTINATION lib
-            EXPORT cudf-targets)
+            EXPORT cudf-testing-targets)
     endif()
 endfunction()
 
diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
index 54e0a8620c5..e5d1f2f07a9 100644
--- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
@@ -28,6 +28,11 @@ function(cudf_restore_if_enabled var)
 endfunction()
 
 function(find_and_configure_rmm VERSION)
+
+    if(TARGET rmm::rmm)
+        return()
+    endif()
+
     # Consumers have two options for local source builds:
     # 1. Pass `-D CPM_rmm_SOURCE=/path/to/rmm` to build a local RMM source tree
     # 2. Pass `-D CMAKE_PREFIX_PATH=/path/to/rmm/build` to use an existing local
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index a81b6ebc8a1..3c454c85720 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -74,7 +74,8 @@ class aggregation {
     NUNIQUE,         ///< count number of unique elements
     NTH_ELEMENT,     ///< get the nth element
     ROW_NUMBER,      ///< get row-number of current index (relative to rolling window)
-    COLLECT,         ///< collect values into a list
+    COLLECT_LIST,    ///< collect values into a list
+    COLLECT_SET,     ///< collect values into a list without duplicate entries
     LEAD,            ///< window function, accesses row at specified offset following current row
     LAG,             ///< window function, accesses row at specified offset preceding current row
     PTX,             ///< PTX  UDF based reduction
@@ -205,18 +206,35 @@ std::unique_ptr<aggregation> make_nth_element_aggregation(
 std::unique_ptr<aggregation> make_row_number_aggregation();
 
 /**
- * @brief Factory to create a COLLECT aggregation
+ * @brief Factory to create a COLLECT_LIST aggregation
  *
- * `COLLECT` returns a list column of all included elements in the group/series.
+ * `COLLECT_LIST` returns a list column of all included elements in the group/series.
  *
  * If `null_handling` is set to `EXCLUDE`, null elements are dropped from each
  * of the list rows.
  *
  * @param null_handling Indicates whether to include/exclude nulls in list elements.
  */
-std::unique_ptr<aggregation> make_collect_aggregation(
+std::unique_ptr<aggregation> make_collect_list_aggregation(
   null_policy null_handling = null_policy::INCLUDE);
 
+/**
+ * @brief Factory to create a COLLECT_SET aggregation
+ *
+ * `COLLECT_SET` returns a lists column of all included elements in the group/series. Within each
+ * list, the duplicated entries are dropped out such that each entry appears only once.
+ *
+ * If `null_handling` is set to `EXCLUDE`, null elements are dropped from each
+ * of the list rows.
+ *
+ * @param null_handling Indicates whether to include/exclude nulls during collection
+ * @param null_equal    Flag to specify whether null entries within each list should be considered
+ * equal
+ */
+std::unique_ptr<aggregation> make_collect_set_aggregation(
+  null_policy null_handling = null_policy::INCLUDE,
+  null_equality null_equal  = null_equality::EQUAL);
+
 /// Factory to create a LAG aggregation
 std::unique_ptr<aggregation> make_lag_aggregation(size_type offset);
 
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index 8ec26cf5eb7..27bcb0d320b 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -187,7 +187,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr void ast_operator_dispatcher(ast_operator op
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Invalid operator.");
 #else
-      release_assert(false && "Invalid operator.");
+      cudf_assert(false && "Invalid operator.");
 #endif
       break;
   }
@@ -784,7 +784,7 @@ struct double_dispatch_binary_operator_types {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid binary operation.");
 #else
-    release_assert(false && "Invalid binary operation.");
+    cudf_assert(false && "Invalid binary operation.");
 #endif
   }
 };
@@ -819,7 +819,7 @@ struct single_dispatch_binary_operator_types {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid binary operation.");
 #else
-    release_assert(false && "Invalid binary operation.");
+    cudf_assert(false && "Invalid binary operation.");
 #endif
   }
 };
@@ -924,7 +924,7 @@ struct dispatch_unary_operator_types {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid unary operation.");
 #else
-    release_assert(false && "Invalid unary operation.");
+    cudf_assert(false && "Invalid unary operation.");
 #endif
   }
 };
@@ -996,7 +996,7 @@ struct return_type_functor {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid binary operation. Return type cannot be determined.");
 #else
-    release_assert(false && "Invalid binary operation. Return type cannot be determined.");
+    cudf_assert(false && "Invalid binary operation. Return type cannot be determined.");
 #endif
   }
 
@@ -1024,7 +1024,7 @@ struct return_type_functor {
 #ifndef __CUDA_ARCH__
     CUDF_FAIL("Invalid unary operation. Return type cannot be determined.");
 #else
-    release_assert(false && "Invalid unary operation. Return type cannot be determined.");
+    cudf_assert(false && "Invalid unary operation. Return type cannot be determined.");
 #endif
   }
 };
diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh
index ee08742d871..2719a8b5077 100644
--- a/cpp/include/cudf/ast/detail/transform.cuh
+++ b/cpp/include/cudf/ast/detail/transform.cuh
@@ -87,7 +87,7 @@ struct unary_row_output : public row_output {
                              Input input,
                              detail::device_data_reference output) const
   {
-    release_assert(false && "Invalid unary dispatch operator for the provided input.");
+    cudf_assert(false && "Invalid unary dispatch operator for the provided input.");
   }
 };
 
@@ -116,7 +116,7 @@ struct binary_row_output : public row_output {
                              RHS rhs,
                              detail::device_data_reference output) const
   {
-    release_assert(false && "Invalid binary dispatch operator for the provided input.");
+    cudf_assert(false && "Invalid binary dispatch operator for the provided input.");
   }
 };
 
@@ -239,7 +239,7 @@ struct row_evaluator {
                              detail::device_data_reference rhs,
                              detail::device_data_reference output) const
   {
-    release_assert(false && "Invalid binary dispatch operator for the provided input.");
+    cudf_assert(false && "Invalid binary dispatch operator for the provided input.");
   }
 
  private:
@@ -311,7 +311,7 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator,
                       output,
                       op);
     } else {
-      release_assert(false && "Invalid operator arity.");
+      cudf_assert(false && "Invalid operator arity.");
     }
   }
 }
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index b2f152180b0..5a02f5bbe55 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -774,7 +774,7 @@ struct index_element_fn {
                                  std::is_unsigned<IndexType>::value)>* = nullptr>
   __device__ size_type operator()(Args&&... args)
   {
-    release_assert(false and "dictionary indices must be an unsigned integral type");
+    cudf_assert(false and "dictionary indices must be an unsigned integral type");
     return 0;
   }
 };
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 3d006449044..3f5f5a91632 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -19,8 +19,8 @@
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
-#include <cudf/detail/utilities/release_assert.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
 
@@ -103,7 +103,7 @@ struct update_target_element {
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    release_assert(false and "Invalid source type and aggregation combination.");
+    cudf_assert(false and "Invalid source type and aggregation combination.");
   }
 };
 
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 1cafad25c9c..18bef301e03 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <cudf/aggregation.hpp>
-#include <cudf/detail/utilities/release_assert.cuh>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -320,11 +320,11 @@ struct udf_aggregation final : derived_aggregation<udf_aggregation> {
 };
 
 /**
- * @brief Derived aggregation class for specifying COLLECT aggregation
+ * @brief Derived aggregation class for specifying COLLECT_LIST aggregation
  */
 struct collect_list_aggregation final : derived_aggregation<nunique_aggregation> {
   explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE)
-    : derived_aggregation{COLLECT}, _null_handling{null_handling}
+    : derived_aggregation{COLLECT_LIST}, _null_handling{null_handling}
   {
   }
   null_policy _null_handling;  ///< include or exclude nulls
@@ -340,6 +340,32 @@ struct collect_list_aggregation final : derived_aggregation<nunique_aggregation>
   size_t hash_impl() const { return std::hash<int>{}(static_cast<int>(_null_handling)); }
 };
 
+/**
+ * @brief Derived aggregation class for specifying COLLECT_SET aggregation
+ */
+struct collect_set_aggregation final : derived_aggregation<collect_set_aggregation> {
+  explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
+                                   null_equality null_equal  = null_equality::EQUAL)
+    : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal{null_equal}
+  {
+  }
+  null_policy _null_handling;  ///< include or exclude nulls
+  null_equality _null_equal;   ///< whether to consider nulls as equal values
+
+ protected:
+  friend class derived_aggregation<collect_set_aggregation>;
+
+  bool operator==(collect_set_aggregation const& other) const
+  {
+    return _null_handling == other._null_handling && _null_equal == other._null_equal;
+  }
+
+  size_t hash_impl() const  // one bit per flag: XOR would collide whenever both flags match
+  {
+    return std::hash<int>{}(static_cast<int>(_null_handling) * 2 + static_cast<int>(_null_equal));
+  }
+};
+
 /**
  * @brief Sentinel value used for `ARGMAX` aggregation.
  *
@@ -514,9 +540,15 @@ struct target_type_impl<Source, aggregation::ROW_NUMBER> {
   using type = cudf::size_type;
 };
 
-// Always use list for COLLECT
+// Always use list for COLLECT_LIST
+template <typename Source>
+struct target_type_impl<Source, aggregation::COLLECT_LIST> {
+  using type = cudf::list_view;
+};
+
+// Always use list for COLLECT_SET
 template <typename Source>
-struct target_type_impl<Source, aggregation::COLLECT> {
+struct target_type_impl<Source, aggregation::COLLECT_SET> {
   using type = cudf::list_view;
 };
 
@@ -617,8 +649,10 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin
       return f.template operator()<aggregation::NTH_ELEMENT>(std::forward<Ts>(args)...);
     case aggregation::ROW_NUMBER:
       return f.template operator()<aggregation::ROW_NUMBER>(std::forward<Ts>(args)...);
-    case aggregation::COLLECT:
-      return f.template operator()<aggregation::COLLECT>(std::forward<Ts>(args)...);
+    case aggregation::COLLECT_LIST:
+      return f.template operator()<aggregation::COLLECT_LIST>(std::forward<Ts>(args)...);
+    case aggregation::COLLECT_SET:
+      return f.template operator()<aggregation::COLLECT_SET>(std::forward<Ts>(args)...);
     case aggregation::LEAD:
       return f.template operator()<aggregation::LEAD>(std::forward<Ts>(args)...);
     case aggregation::LAG:
@@ -627,7 +661,7 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Unsupported aggregation.");
 #else
-      release_assert(false && "Unsupported aggregation.");
+      cudf_assert(false && "Unsupported aggregation.");
 
       // The following code will never be reached, but the compiler generates a
       // warning if there isn't a return value.
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 87f5c9251c7..73647ac2292 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -18,8 +18,8 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/release_assert.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index cadcb1265c4..a68d649b8c8 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -63,8 +63,8 @@ struct sort_groupby_helper {
                       sorted keys_pre_sorted        = sorted::NO)
     : _keys(keys),
       _num_keys(-1),
-      _include_null_keys(include_null_keys),
-      _keys_pre_sorted(keys_pre_sorted)
+      _keys_pre_sorted(keys_pre_sorted),
+      _include_null_keys(include_null_keys)
   {
     if (keys_pre_sorted == sorted::YES and include_null_keys == null_policy::EXCLUDE and
         has_nulls(keys)) {
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 8568bd68bfd..8bbd0d1aada 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -268,7 +268,7 @@ struct input_indexalator : base_indexalator<input_indexalator> {
     template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
     __device__ size_type operator()(void const* tp)
     {
-      release_assert(false and "only index types are supported");
+      cudf_assert(false and "only index types are supported");
       return 0;
     }
   };
@@ -366,7 +366,7 @@ struct output_indexalator : base_indexalator<output_indexalator> {
     template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
     __device__ void operator()(void* tp, size_type const value)
     {
-      release_assert(false and "only index types are supported");
+      cudf_assert(false and "only index types are supported");
     }
   };
 
diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp
new file mode 100644
index 00000000000..b4da6d888fa
--- /dev/null
+++ b/cpp/include/cudf/detail/label_bins.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/labeling/label_bins.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+namespace cudf {
+
+namespace detail {
+
+/**
+ * @addtogroup label_bins
+ * @{
+ * @file
+ * @brief Internal APIs for labeling values by bin.
+ */
+
+/**
+ * @copydoc cudf::label_bins(column_view const& input, column_view const& left_edges, inclusive
+ * left_inclusive, column_view const& right_edges, inclusive right_inclusive,
+ * rmm::mr::device_memory_resource* mr)
+ *
+ * @param stream Stream view on which to allocate resources and queue execution.
+ */
+std::unique_ptr<column> label_bins(
+  column_view const& input,
+  column_view const& left_edges,
+  inclusive left_inclusive,
+  column_view const& right_edges,
+  inclusive right_inclusive,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of group
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index 93f54cff588..08dae998944 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -23,8 +23,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-using cudf::device_span;
-
 namespace cudf {
 namespace detail {
 /**
diff --git a/cpp/include/cudf/detail/utilities/release_assert.cuh b/cpp/include/cudf/detail/utilities/assert.cuh
similarity index 87%
rename from cpp/include/cudf/detail/utilities/release_assert.cuh
rename to cpp/include/cudf/detail/utilities/assert.cuh
index e0db88d8fcb..69f9e2d3791 100644
--- a/cpp/include/cudf/detail/utilities/release_assert.cuh
+++ b/cpp/include/cudf/detail/utilities/assert.cuh
@@ -27,11 +27,11 @@
  *
  * Relies on the `__PRETTY_FUNCTION__` macro which is specific to GCC and Clang.
  */
-#if defined(__CUDA_ARCH__) && (defined(__clang__) || defined(__GNUC__))
+#if !defined(NDEBUG) && defined(__CUDA_ARCH__) && (defined(__clang__) || defined(__GNUC__))
 #define __ASSERT_STR_HELPER(x) #x
-#define release_assert(e)     \
+#define cudf_assert(e)        \
   ((e) ? static_cast<void>(0) \
        : __assert_fail(__ASSERT_STR_HELPER(e), __FILE__, __LINE__, __PRETTY_FUNCTION__))
 #else
-#define release_assert(e) (static_cast<void>(0))
+#define cudf_assert(e) (static_cast<void>(0))
 #endif
diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
index 8b04651e1e6..31533a69487 100644
--- a/cpp/include/cudf/detail/utilities/hash_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <cudf/column/column_device_view.cuh>
-#include <cudf/detail/utilities/release_assert.cuh>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <hash/hash_constants.hpp>
@@ -155,7 +155,7 @@ struct MD5ListHasher {
                              size_type offset_end,
                              md5_intermediate_data* hash_state) const
   {
-    release_assert(false && "MD5 Unsupported chrono type column");
+    cudf_assert(false && "MD5 Unsupported chrono type column");
   }
 
   template <typename T, std::enable_if_t<!is_fixed_width<T>()>* = nullptr>
@@ -164,7 +164,7 @@ struct MD5ListHasher {
                              size_type offset_end,
                              md5_intermediate_data* hash_state) const
   {
-    release_assert(false && "MD5 Unsupported non-fixed-width type column");
+    cudf_assert(false && "MD5 Unsupported non-fixed-width type column");
   }
 
   template <typename T, std::enable_if_t<is_floating_point<T>()>* = nullptr>
@@ -274,7 +274,7 @@ struct MD5Hash {
                              size_type row_index,
                              md5_intermediate_data* hash_state) const
   {
-    release_assert(false && "MD5 Unsupported chrono type column");
+    cudf_assert(false && "MD5 Unsupported chrono type column");
   }
 
   template <typename T, std::enable_if_t<!is_fixed_width<T>()>* = nullptr>
@@ -282,7 +282,7 @@ struct MD5Hash {
                              size_type row_index,
                              md5_intermediate_data* hash_state) const
   {
-    release_assert(false && "MD5 Unsupported non-fixed-width type column");
+    cudf_assert(false && "MD5 Unsupported non-fixed-width type column");
   }
 
   template <typename T, std::enable_if_t<is_floating_point<T>()>* = nullptr>
@@ -345,7 +345,7 @@ void CUDA_DEVICE_CALLABLE MD5Hash::operator()<list_view>(column_device_view col,
   column_device_view offsets = col.child(offsets_column_index);
   column_device_view data    = col.child(data_column_index);
 
-  if (data.type().id() == type_id::LIST) release_assert(false && "Nested list unsupported");
+  if (data.type().id() == type_id::LIST) cudf_assert(false && "Nested list unsupported");
 
   cudf::type_dispatcher(data.type(),
                         MD5ListHasher{},
@@ -765,7 +765,7 @@ struct IdentityHash {
   CUDA_HOST_DEVICE_CALLABLE std::enable_if_t<!std::is_arithmetic<Key>::value, return_type>
   operator()(Key const& key) const
   {
-    release_assert(false && "IdentityHash does not support this data type");
+    cudf_assert(false && "IdentityHash does not support this data type");
     return 0;
   }
 
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 8f8e2b7394c..eb752a8a0ea 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/release_assert.cuh>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/types.hpp>
 
 // Note: The <cuda/std/*> versions are used in order for Jitify to work with our fixed_point type.
@@ -91,7 +91,7 @@ template <typename Rep,
                                            is_supported_representation_type<Rep>())>* = nullptr>
 CUDA_HOST_DEVICE_CALLABLE Rep ipow(T exponent)
 {
-  release_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible.");
+  cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible.");
   if (exponent == 0) return static_cast<Rep>(1);
   auto extra  = static_cast<Rep>(1);
   auto square = static_cast<Rep>(Base);
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index f7f7f51479d..1dfacd53e0d 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -166,6 +166,61 @@ class groupby {
     std::vector<aggregation_request> const& requests,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+  /**
+   * @brief Performs grouped scans on the specified values.
+   *
+   * The values to aggregate and the aggregations to perform are specified in an
+   * `aggregation_request`. Each request contains a `column_view` of values to
+   * aggregate and a set of `aggregation`s to perform on those elements.
+   *
+   * For each `aggregation` in a request, `values[i]` is scan aggregated with
+   * all previous `values[j]` where rows `i` and `j` in `keys` are equivalent.
+   *
+   * The `size()` of the request column must equal `keys.num_rows()`.
+   *
+   * For every `aggregation_request` an `aggregation_result` will be returned.
+   * The `aggregation_result` holds the resulting column(s) for each requested
+   * aggregation on the `request`s values. The order of the columns in each
+   * result is the same order as was specified in the request.
+   *
+   * The returned `table` contains the group labels for each row, i.e., the
+   * `keys` given to groupby object. Element `i` across all aggregation results
+   * belongs to the group at row `i` in the group labels table.
+   *
+   * The order of the rows in the group labels is arbitrary. Furthermore,
+   * successive `groupby::scan` calls may return results in different orders.
+   *
+   * @throws cudf::logic_error If `requests[i].values.size() !=
+   * keys.num_rows()`.
+   *
+   * Example:
+   * ```
+   * Input:
+   * keys:     {1 2 1 3 1}
+   *           {1 2 1 4 1}
+   * request:
+   *   values: {3 1 4 9 2}
+   *   aggregations: {{SUM}, {MIN}}
+   *
+   * result:
+   *
+   * keys:  {3 1 1 1 2}
+   *        {4 1 1 1 2}
+   * values:
+   *   SUM: {9 3 7 9 1}
+   *   MIN: {9 3 3 2 1}
+   * ```
+   *
+   * @param requests The set of columns to scan and the scans to perform
+   * @param mr Device memory resource used to allocate the returned table and columns' device memory
+   * @return Pair containing the table with each group's key and
+   * a vector of aggregation_results for each request in the same order as
+   * specified in `requests`.
+   */
+  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
+    std::vector<aggregation_request> const& requests,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
   /**
    * @brief The grouped data corresponding to a groupby operation on a set of values.
    *
@@ -231,6 +286,11 @@ class groupby {
     std::vector<aggregation_request> const& requests,
     rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr);
+
+  std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_scan(
+    std::vector<aggregation_request> const& requests,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr);
 };
 /** @} */
 }  // namespace groupby
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 3e63e8fc770..7cb3db1eb30 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -24,6 +24,8 @@
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
+#include <thrust/optional.h>
+
 #include <iostream>
 #include <memory>
 #include <string>
diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp
new file mode 100644
index 00000000000..7244698f8a2
--- /dev/null
+++ b/cpp/include/cudf/labeling/label_bins.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+namespace cudf {
+
+/**
+ * @addtogroup label_bins
+ * @{
+ * @file
+ * @brief APIs for labeling values by bin.
+ */
+
+/**
+ * @brief Enum used to define whether or not bins include their boundary points.
+ */
+enum class inclusive { YES, NO };
+
+/**
+ * @brief Labels elements based on membership in the specified bins.
+ *
+ * A bin `i` is defined by `left_edges[i], right_edges[i]`. Whether the edges are inclusive or
+ * not is determined by `left_inclusive` and `right_inclusive`, respectively.
+ *
+ * A value `input[j]` belongs to bin `i` if `input[j]` is contained in the range `left_edges[i],
+ * right_edges[i]` (with the specified inclusiveness), in which case `label[j] == i`. If
+ * `input[j]` does not belong to any bin, then `label[j]` is NULL.
+ *
+ * Notes:
+ *   - If an empty set of edges is provided, all elements in `input` are labeled NULL.
+ *   - NULL elements in `input` belong to no bin and their corresponding label is NULL.
+ *   - NaN elements in `input` belong to no bin and their corresponding label is NULL.
+ *   - Bins must be provided in monotonically increasing order, otherwise behavior is undefined.
+ *   - If two or more bins overlap, behavior is undefined.
+ *
+ * @throws cudf::logic_error if `input.type() == left_edges.type() == right_edges.type()` is
+ * violated.
+ * @throws cudf::logic_error if `left_edges.size() != right_edges.size()`
+ * @throws cudf::logic_error if `left_edges.has_nulls()` or `right_edges.has_nulls()`
+ *
+ * @param input The input elements to label according to the specified bins.
+ * @param left_edges Values of the left edge of each bin.
+ * @param left_inclusive Whether or not the left edge is inclusive.
+ * @param right_edges Value of the right edge of each bin.
+ * @param right_inclusive Whether or not the right edge is inclusive.
+ * @param edge_null_precedence Whether nulls in left and right edges are at the beginning or the
+ * end.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return The integer labels of the elements in `input` according to the specified bins.
+ */
+std::unique_ptr<column> label_bins(
+  column_view const& input,
+  column_view const& left_edges,
+  inclusive left_inclusive,
+  column_view const& right_edges,
+  inclusive right_inclusive,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of group
+}  // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp
new file mode 100644
index 00000000000..ba3e1d17d7f
--- /dev/null
+++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/lists/lists_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace lists {
+namespace detail {
+
+/**
+ * @copydoc cudf::lists::drop_list_duplicates
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> drop_list_duplicates(
+  lists_column_view const& lists_column,
+  null_equality nulls_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+}  // namespace detail
+}  // namespace lists
+}  // namespace cudf
diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh
index 3afafe9d1fa..4f207474526 100644
--- a/cpp/include/cudf/lists/list_device_view.cuh
+++ b/cpp/include/cudf/lists/list_device_view.cuh
@@ -37,12 +37,12 @@ class list_device_view {
     : lists_column(lists_column), _row_index(row_index)
   {
     column_device_view const& offsets = lists_column.offsets();
-    release_assert(row_index >= 0 && row_index < lists_column.size() &&
-                   row_index < offsets.size() && "row_index out of bounds");
+    cudf_assert(row_index >= 0 && row_index < lists_column.size() && row_index < offsets.size() &&
+                "row_index out of bounds");
 
     begin_offset = offsets.element<size_type>(row_index);
-    release_assert(begin_offset >= 0 && begin_offset <= lists_column.child().size() &&
-                   "begin_offset out of bounds.");
+    cudf_assert(begin_offset >= 0 && begin_offset <= lists_column.child().size() &&
+                "begin_offset out of bounds.");
     _size = offsets.element<size_type>(row_index + 1) - begin_offset;
   }
 
@@ -71,7 +71,7 @@ class list_device_view {
    */
   CUDA_DEVICE_CALLABLE size_type element_offset(size_type idx) const
   {
-    release_assert(idx >= 0 && idx < size() && "idx out of bounds");
+    cudf_assert(idx >= 0 && idx < size() && "idx out of bounds");
     return begin_offset + idx;
   }
 
@@ -93,7 +93,7 @@ class list_device_view {
    */
   CUDA_DEVICE_CALLABLE bool is_null(size_type idx) const
   {
-    release_assert(idx >= 0 && idx < size() && "Index out of bounds.");
+    cudf_assert(idx >= 0 && idx < size() && "Index out of bounds.");
     auto element_offset = begin_offset + idx;
     return lists_column.child().is_null(element_offset);
   }
@@ -294,7 +294,7 @@ struct list_size_functor {
   CUDA_HOST_DEVICE_CALLABLE list_size_functor(column_device_view const& d_col) : d_column(d_col)
   {
 #if defined(__CUDA_ARCH__)
-    release_assert(d_col.type().id() == type_id::LIST && "Only list type column is supported");
+    cudf_assert(d_col.type().id() == type_id::LIST && "Only list type column is supported");
 #else
     CUDF_EXPECTS(d_col.type().id() == type_id::LIST, "Only list type column is supported");
 #endif
diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp
index 1e2fa80b129..4d29b0a5b6a 100644
--- a/cpp/include/cudf/strings/convert/convert_integers.hpp
+++ b/cpp/include/cudf/strings/convert/convert_integers.hpp
@@ -78,7 +78,10 @@ std::unique_ptr<column> from_integers(
  * characters are valid for conversion to integers.
  *
  * The output row entry will be set to `true` if the corresponding string element
- * has at least one character in [-+0-9].
+ * has all characters in [-+0-9]. The optional sign character must only be in the first
+ * position. Notice that the integer value is not checked to be within its storage limits.
+ * For strict integer type check, use the other `is_integer()` API which accepts `data_type`
+ * argument.
  *
  * @code{.pseudo}
  * Example:
@@ -89,12 +92,44 @@ std::unique_ptr<column> from_integers(
  *
  * Any null row results in a null entry for that row in the output column.
  *
- * @param strings Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New column of boolean results for each string.
+ * @param strings  Strings instance for this operation.
+ * @param mr       Device memory resource used to allocate the returned column's device memory.
+ * @return         New column of boolean results for each string.
+ */
+std::unique_ptr<column> is_integer(
+  strings_column_view const& strings,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a boolean column identifying strings in which all
+ * characters are valid for conversion to integers.
+ *
+ * The output row entry will be set to `true` if the corresponding string element
+ * has all characters in [-+0-9]. The optional sign character must only be in the first
+ * position. Also, the integer component must fit within the size limits of the underlying
+ * storage type, which is provided by the int_type parameter.
+ *
+ * @code{.pseudo}
+ * Example:
+ * s = ['123456', '-456', '', 'A', '+7']
+ *
+ * output1 = s.is_integer(s, data_type{type_id::INT32})
+ * output1 is [true, true, false, false, true]
+ *
+ * output2 = s.is_integer(s, data_type{type_id::INT8})
+ * output2 is [false, false, false, false, true]
+ * @endcode
+ *
+ * Any null row results in a null entry for that row in the output column.
+ *
+ * @param strings  Strings instance for this operation.
+ * @param int_type Integer type used for checking underflow and overflow.
+ * @param mr       Device memory resource used to allocate the returned column's device memory.
+ * @return         New column of boolean results for each string.
  */
 std::unique_ptr<column> is_integer(
   strings_column_view const& strings,
+  data_type int_type,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index d9840e78be2..04d215ff7cb 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/hash_functions.cuh>
-#include <cudf/detail/utilities/release_assert.cuh>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/traits.hpp>
@@ -190,7 +190,7 @@ class element_equality_comparator {
             std::enable_if_t<not cudf::is_equality_comparable<Element, Element>()>* = nullptr>
   __device__ bool operator()(size_type lhs_element_index, size_type rhs_element_index)
   {
-    release_assert(false && "Attempted to compare elements of uncomparable types.");
+    cudf_assert(false && "Attempted to compare elements of uncomparable types.");
     return false;
   }
 
@@ -291,7 +291,7 @@ class element_relational_comparator {
             std::enable_if_t<not cudf::is_relationally_comparable<Element, Element>()>* = nullptr>
   __device__ weak_ordering operator()(size_type lhs_element_index, size_type rhs_element_index)
   {
-    release_assert(false && "Attempted to compare elements of uncomparable types.");
+    cudf_assert(false && "Attempted to compare elements of uncomparable types.");
     return weak_ordering::LESS;
   }
 
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index 48e5d9543b8..7a3316a0571 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,8 +24,6 @@
 #define CUDA_DEVICE_CALLABLE inline
 #endif
 
-#include <thrust/optional.h>  // TODO no idea why this is needed ¯\_(ツ)_/¯
-
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 26c51d0435a..bd9ea015a32 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/release_assert.cuh>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
@@ -501,7 +501,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) type_dispatcher(cudf::data_ty
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Unsupported type_id.");
 #else
-      release_assert(false && "Unsupported type_id.");
+      cudf_assert(false && "Unsupported type_id.");
 
       // The following code will never be reached, but the compiler generates a
       // warning if there isn't a return value.
diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h
index 3f3efdb7626..65dd5c73475 100644
--- a/cpp/include/doxygen_groups.h
+++ b/cpp/include/doxygen_groups.h
@@ -147,6 +147,7 @@
  *   @defgroup lists_gather Gathering
  *   @defgroup lists_elements Counting
  *   @defgroup lists_drop_duplicates Filtering
+ *   @defgroup lists_sort Sorting
  * @}
  * @defgroup nvtext_apis NVText
  * @{
@@ -164,4 +165,8 @@
  *   @defgroup utility_bitmask Bitmask
  *   @defgroup utility_error Exception
  * @}
+ * @defgroup labeling_apis Labeling
+ * @{
+ *   @defgroup label_bins Bin Labeling
+ * @}
  */
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 04dc8776d20..33c19617308 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -125,11 +125,17 @@ std::unique_ptr<aggregation> make_row_number_aggregation()
 {
   return std::make_unique<aggregation>(aggregation::ROW_NUMBER);
 }
-/// Factory to create a COLLECT aggregation
-std::unique_ptr<aggregation> make_collect_aggregation(null_policy null_handling)
+/// Factory to create a COLLECT_LIST aggregation
+std::unique_ptr<aggregation> make_collect_list_aggregation(null_policy null_handling)
 {
   return std::make_unique<detail::collect_list_aggregation>(null_handling);
 }
+/// Factory to create a COLLECT_SET aggregation
+std::unique_ptr<aggregation> make_collect_set_aggregation(null_policy null_handling,
+                                                          null_equality null_equal)
+{
+  return std::make_unique<detail::collect_set_aggregation>(null_handling, null_equal);
+}
 /// Factory to create a LAG aggregation
 std::unique_ptr<aggregation> make_lag_aggregation(size_type offset)
 {
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 60167d77507..845a5512c27 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -44,8 +44,6 @@
 #include <numeric>
 #include <type_traits>
 
-using cudf::device_span;
-
 namespace cudf {
 size_type state_null_count(mask_state state, size_type size)
 {
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 487aed4b411..cdd8ceb0a6c 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -159,6 +159,24 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr
   return dispatch_aggregation(requests, 0, mr);
 }
 
+// Compute the scan aggregations of every request through the sort-based implementation
+std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan(
+  std::vector<aggregation_request> const& requests, rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const num_key_rows = _keys.num_rows();
+  auto const sizes_match  = std::all_of(requests.begin(), requests.end(), [&](auto const& req) {
+    return req.values.size() == num_key_rows;
+  });
+  CUDF_EXPECTS(sizes_match, "Size mismatch between request values and groupby keys.");
+
+  verify_valid_requests(requests);
+
+  if (num_key_rows == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); }
+
+  return sort_scan(requests, rmm::cuda_stream_default, mr);
+}
+
 groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh
index a491b50478a..24de22705a9 100644
--- a/cpp/src/groupby/hash/multi_pass_kernels.cuh
+++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh
@@ -20,8 +20,8 @@
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
-#include <cudf/detail/utilities/release_assert.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -65,7 +65,7 @@ struct var_hash_functor {
                                                                   size_type source_index,
                                                                   size_type target_index) noexcept
   {
-    release_assert(false and "Invalid source type for std, var aggregation combination.");
+    cudf_assert(false and "Invalid source type for std, var aggregation combination.");
   }
 
   template <typename Source>
diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/aggregate.cpp
similarity index 79%
rename from cpp/src/groupby/sort/groupby.cu
rename to cpp/src/groupby/sort/aggregate.cpp
index 5c54dd3cb4c..b171b19413b 100644
--- a/cpp/src/groupby/sort/groupby.cu
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,20 +15,20 @@
  */
 
 #include <groupby/common/utils.hpp>
-#include "group_reductions.hpp"
+#include <groupby/sort/functors.hpp>
+#include <groupby/sort/group_reductions.hpp>
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/aggregation/result_cache.hpp>
 #include <cudf/detail/binaryop.hpp>
 #include <cudf/detail/gather.hpp>
-#include <cudf/detail/groupby.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/groupby.hpp>
+#include <cudf/lists/detail/drop_list_duplicates.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
@@ -51,71 +51,17 @@ namespace detail {
  * memoised sorted and/or grouped values and re-using will save on computation
  * of these values.
  */
-struct store_result_functor {
-  store_result_functor(size_type col_idx,
-                       column_view const& values,
-                       sort::sort_groupby_helper& helper,
-                       cudf::detail::result_cache& cache,
-                       rmm::cuda_stream_view stream,
-                       rmm::mr::device_memory_resource* mr)
-    : col_idx(col_idx), helper(helper), cache(cache), values(values), stream(stream), mr(mr)
-  {
-  }
-
+struct aggregrate_result_functor final : store_result_functor {
+  using store_result_functor::store_result_functor;
   template <aggregation::Kind k>
   void operator()(aggregation const& agg)
   {
+    CUDF_FAIL("Unsupported aggregation.");
   }
-
- private:
-  /**
-   * @brief Get the grouped values
-   *
-   * Computes the grouped values from @p values on first invocation and returns
-   * the stored result on subsequent invocation
-   */
-  column_view get_grouped_values()
-  {
-    // TODO (dm): After implementing single pass multi-agg, explore making a
-    //            cache of all grouped value columns rather than one at a time
-    if (grouped_values)
-      return grouped_values->view();
-    else if (sorted_values)
-      // TODO (dm): When we implement scan, it wouldn't be ok to return sorted
-      //            values when asked for grouped values. Change this then.
-      return sorted_values->view();
-    else
-      grouped_values = helper.grouped_values(values);
-    return grouped_values->view();
-  };
-
-  /**
-   * @brief Get the grouped and sorted values
-   *
-   * Computes the grouped and sorted (within each group) values from @p values
-   * on first invocation and returns the stored result on subsequent invocation
-   */
-  column_view get_sorted_values()
-  {
-    if (not sorted_values) sorted_values = helper.sorted_values(values);
-    return sorted_values->view();
-  };
-
- private:
-  size_type col_idx;                  ///< Index of column in requests being operated on
-  sort::sort_groupby_helper& helper;  ///< Sort helper
-  cudf::detail::result_cache& cache;  ///< cache of results to store into
-  column_view const& values;          ///< Column of values to group and aggregate
-
-  rmm::cuda_stream_view stream;         ///< CUDA stream on which to execute kernels
-  rmm::mr::device_memory_resource* mr;  ///< Memory resource to allocate space for results
-
-  std::unique_ptr<column> sorted_values;   ///< Memoised grouped and sorted values
-  std::unique_ptr<column> grouped_values;  ///< Memoised grouped values
 };
 
 template <>
-void store_result_functor::operator()<aggregation::COUNT_VALID>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::COUNT_VALID>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -129,7 +75,7 @@ void store_result_functor::operator()<aggregation::COUNT_VALID>(aggregation cons
 }
 
 template <>
-void store_result_functor::operator()<aggregation::COUNT_ALL>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::COUNT_ALL>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -138,7 +84,7 @@ void store_result_functor::operator()<aggregation::COUNT_ALL>(aggregation const&
 }
 
 template <>
-void store_result_functor::operator()<aggregation::SUM>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::SUM>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -149,7 +95,7 @@ void store_result_functor::operator()<aggregation::SUM>(aggregation const& agg)
 };
 
 template <>
-void store_result_functor::operator()<aggregation::ARGMAX>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::ARGMAX>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -164,7 +110,7 @@ void store_result_functor::operator()<aggregation::ARGMAX>(aggregation const& ag
 };
 
 template <>
-void store_result_functor::operator()<aggregation::ARGMIN>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::ARGMIN>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -179,7 +125,7 @@ void store_result_functor::operator()<aggregation::ARGMIN>(aggregation const& ag
 };
 
 template <>
-void store_result_functor::operator()<aggregation::MIN>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::MIN>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -216,7 +162,7 @@ void store_result_functor::operator()<aggregation::MIN>(aggregation const& agg)
 };
 
 template <>
-void store_result_functor::operator()<aggregation::MAX>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::MAX>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -253,7 +199,7 @@ void store_result_functor::operator()<aggregation::MAX>(aggregation const& agg)
 };
 
 template <>
-void store_result_functor::operator()<aggregation::MEAN>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::MEAN>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -277,7 +223,7 @@ void store_result_functor::operator()<aggregation::MEAN>(aggregation const& agg)
 };
 
 template <>
-void store_result_functor::operator()<aggregation::VARIANCE>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::VARIANCE>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -300,7 +246,7 @@ void store_result_functor::operator()<aggregation::VARIANCE>(aggregation const&
 };
 
 template <>
-void store_result_functor::operator()<aggregation::STD>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::STD>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -314,7 +260,7 @@ void store_result_functor::operator()<aggregation::STD>(aggregation const& agg)
 };
 
 template <>
-void store_result_functor::operator()<aggregation::QUANTILE>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::QUANTILE>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -335,7 +281,7 @@ void store_result_functor::operator()<aggregation::QUANTILE>(aggregation const&
 };
 
 template <>
-void store_result_functor::operator()<aggregation::MEDIAN>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::MEDIAN>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -355,7 +301,7 @@ void store_result_functor::operator()<aggregation::MEDIAN>(aggregation const& ag
 };
 
 template <>
-void store_result_functor::operator()<aggregation::NUNIQUE>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::NUNIQUE>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -372,7 +318,7 @@ void store_result_functor::operator()<aggregation::NUNIQUE>(aggregation const& a
 };
 
 template <>
-void store_result_functor::operator()<aggregation::NTH_ELEMENT>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::NTH_ELEMENT>(aggregation const& agg)
 {
   if (cache.has_result(col_idx, agg)) return;
 
@@ -401,12 +347,12 @@ void store_result_functor::operator()<aggregation::NTH_ELEMENT>(aggregation cons
 }
 
 template <>
-void store_result_functor::operator()<aggregation::COLLECT>(aggregation const& agg)
+void aggregrate_result_functor::operator()<aggregation::COLLECT_LIST>(aggregation const& agg)
 {
   auto null_handling =
     static_cast<cudf::detail::collect_list_aggregation const&>(agg)._null_handling;
   CUDF_EXPECTS(null_handling == null_policy::INCLUDE,
-               "null exclusion is not supported on groupby COLLECT aggregation.");
+               "null exclusion is not supported on groupby COLLECT_LIST aggregation.");
 
   if (cache.has_result(col_idx, agg)) return;
 
@@ -416,6 +362,25 @@ void store_result_functor::operator()<aggregation::COLLECT>(aggregation const& a
   cache.add_result(col_idx, agg, std::move(result));
 };
 
+template <>
+void aggregrate_result_functor::operator()<aggregation::COLLECT_SET>(aggregation const& agg)
+{
+  // Downcast once: the null policy and the null equality both live on the derived type.
+  auto const& set_agg = static_cast<cudf::detail::collect_set_aggregation const&>(agg);
+  CUDF_EXPECTS(set_agg._null_handling == null_policy::INCLUDE,
+               "null exclusion is not supported on groupby COLLECT_SET aggregation.");
+
+  if (cache.has_result(col_idx, agg)) return;
+
+  // Collect the grouped values first, then drop the duplicates from the collected lists.
+  auto const collected = detail::group_collect(
+    get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr);
+  auto deduplicated = lists::detail::drop_list_duplicates(
+    lists_column_view(collected->view()), set_agg._null_equal, stream, mr);
+
+  // Store the result in the cache; the has_result check above short-circuits repeat requests.
+  cache.add_result(col_idx, agg, std::move(deduplicated));
+};
 }  // namespace detail
 
 // Sort-based groupby
@@ -431,7 +396,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort
 
   for (size_t i = 0; i < requests.size(); i++) {
     auto store_functor =
-      detail::store_result_functor(i, requests[i].values, helper(), cache, stream, mr);
+      detail::aggregrate_result_functor(i, requests[i].values, helper(), cache, stream, mr);
     for (size_t j = 0; j < requests[i].aggregations.size(); j++) {
       // TODO (dm): single pass compute all supported reductions
       cudf::detail::aggregation_dispatcher(
diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp
new file mode 100644
index 00000000000..565320fbe80
--- /dev/null
+++ b/cpp/src/groupby/sort/functors.hpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/detail/groupby/sort_helper.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <memory>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+/**
+ * @brief Base functor for dispatching sort-based groupby aggregations
+ *
+ * Derived functors add the `operator()` overloads and are used with
+ * `aggregation_dispatcher` to compute the appropriate aggregation. If the values
+ * on which to run the aggregation are unchanged, then the functor should be
+ * re-used. This is because it stores memoised sorted and/or grouped values and
+ * re-using will save on computation of these values.
+ */
+struct store_result_functor {
+  store_result_functor(size_type col_idx,
+                       column_view const& values,
+                       sort::sort_groupby_helper& helper,
+                       cudf::detail::result_cache& cache,
+                       rmm::cuda_stream_view stream,
+                       rmm::mr::device_memory_resource* mr)
+    : col_idx(col_idx), helper(helper), cache(cache), values(values), stream(stream), mr(mr)
+  {
+  }
+
+ protected:
+  /**
+   * @brief Get the grouped values
+   *
+   * Returns the memoised grouped values, or else the memoised sorted values, when
+   * available; otherwise computes the grouped values from @p values and stores them
+   */
+  column_view get_grouped_values()
+  {
+    // TODO (dm): After implementing single pass multi-agg, explore making a
+    //            cache of all grouped value columns rather than one at a time
+    if (grouped_values)
+      return grouped_values->view();
+    else if (sorted_values)
+      // Sorted values are also grouped, so they can stand in for grouped values here. In scan
+      // that wouldn't be ok, so the scan implementation overrides this behavior.
+      return sorted_values->view();
+    else
+      return (grouped_values = helper.grouped_values(values))->view();
+  };
+
+  /**
+   * @brief Get the grouped and sorted values
+   *
+   * Returns the memoised sorted values when available; otherwise computes the grouped
+   * and sorted (within each group) values from @p values and stores them
+   */
+  column_view get_sorted_values()
+  {
+    return sorted_values ? sorted_values->view()
+                         : (sorted_values = helper.sorted_values(values))->view();
+  };
+
+ protected:
+  size_type col_idx;                  ///< Index of column in requests being operated on
+  sort::sort_groupby_helper& helper;  ///< Sort helper that computes grouped/sorted values
+  cudf::detail::result_cache& cache;  ///< Cache of results to store into
+  column_view const& values;          ///< Column of values to group and aggregate (not owned)
+
+  rmm::cuda_stream_view stream;         ///< CUDA stream on which to execute kernels
+  rmm::mr::device_memory_resource* mr;  ///< Memory resource to allocate space for results
+
+  std::unique_ptr<column> sorted_values;   ///< Memoised grouped and sorted values
+  std::unique_ptr<column> grouped_values;  ///< Memoised grouped values
+};
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu
index 60e0ce31db1..121e4bb889d 100644
--- a/cpp/src/groupby/sort/group_count.cu
+++ b/cpp/src/groupby/sort/group_count.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/groupby/sort/group_count_scan.cu b/cpp/src/groupby/sort/group_count_scan.cu
new file mode 100644
index 00000000000..4ad533aebdc
--- /dev/null
+++ b/cpp/src/groupby/sort/group_count_scan.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/scan.h>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_vector.hpp>
+#include <rmm/exec_policy.hpp>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+std::unique_ptr<column> count_scan(cudf::device_span<size_type const> group_labels,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr)
+{
+  auto counts = make_fixed_width_column(
+    data_type{type_id::INT32}, group_labels.size(), mask_state::UNALLOCATED, stream, mr);
+
+  if (group_labels.empty()) return counts;
+
+  // aggregation::COUNT_ALL: exclusive prefix sum of a constant 1, restarted at every new label.
+  auto const ones = thrust::make_constant_iterator<size_type>(1);
+  thrust::exclusive_scan_by_key(rmm::exec_policy(stream),
+                                group_labels.begin(),
+                                group_labels.end(),
+                                ones,
+                                counts->mutable_view().begin<size_type>());
+  return counts;
+}
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu
index bd4e676b83d..3f5592186df 100644
--- a/cpp/src/groupby/sort/group_max.cu
+++ b/cpp/src/groupby/sort/group_max.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu
new file mode 100644
index 00000000000..303d606be9d
--- /dev/null
+++ b/cpp/src/groupby/sort/group_max_scan.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <groupby/sort/group_scan_util.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+std::unique_ptr<column> max_scan(column_view const& values,
+                                 size_type num_groups,
+                                 cudf::device_span<size_type const> group_labels,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  return type_dispatcher(
+    values.type(), scan_functor<aggregation::MAX>{}, values, num_groups, group_labels, stream, mr);
+}
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu
new file mode 100644
index 00000000000..4a692cdf0bd
--- /dev/null
+++ b/cpp/src/groupby/sort/group_min_scan.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <groupby/sort/group_scan_util.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+std::unique_ptr<column> min_scan(column_view const& values,
+                                 size_type num_groups,
+                                 cudf::device_span<size_type const> group_labels,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  return type_dispatcher(
+    values.type(), scan_functor<aggregation::MIN>{}, values, num_groups, group_labels, stream, mr);
+}
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp
new file mode 100644
index 00000000000..efb39068d2e
--- /dev/null
+++ b/cpp/src/groupby/sort/group_scan.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <memory>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+/**
+ * @brief Internal API to calculate groupwise cumulative sum
+ *
+ * @param values Grouped values to get sum of
+ * @param num_groups Number of groups
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> sum_scan(column_view const& values,
+                                 size_type num_groups,
+                                 cudf::device_span<size_type const> group_labels,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Internal API to calculate groupwise cumulative minimum value
+ *
+ * @param values Grouped values to get minimum from
+ * @param num_groups Number of groups
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> min_scan(column_view const& values,
+                                 size_type num_groups,
+                                 cudf::device_span<size_type const> group_labels,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Internal API to calculate groupwise cumulative maximum value
+ *
+ * @param values Grouped values to get maximum from
+ * @param num_groups Number of groups
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> max_scan(column_view const& values,
+                                 size_type num_groups,
+                                 cudf::device_span<size_type const> group_labels,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Internal API to calculate cumulative number of values in each group
+ *
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @return Column of type INT32 of count values
+ */
+std::unique_ptr<column> count_scan(cudf::device_span<size_type const> group_labels,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr);
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
new file mode 100644
index 00000000000..9f8614a61b4
--- /dev/null
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/aggregation/aggregation.cuh>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_vector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/functional.h>
+#include <thrust/scan.h>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+template <aggregation::Kind K>
+struct scan_functor {
+  template <typename T>
+  static constexpr bool is_supported()
+  {
+    if (K == aggregation::SUM)
+      return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
+    else if (K == aggregation::MIN or K == aggregation::MAX)
+      return cudf::is_fixed_width<T>() and is_relationally_comparable<T, T>();
+    else
+      return false;
+  }
+
+  template <typename T>
+  std::enable_if_t<is_supported<T>(), std::unique_ptr<column>> operator()(
+    column_view const& values,
+    size_type num_groups,
+    cudf::device_span<cudf::size_type const> group_labels,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr)
+  {
+    using DeviceType       = device_storage_type_t<T>;
+    using OpType           = cudf::detail::corresponding_operator_t<K>;
+    using ResultType       = cudf::detail::target_type_t<T, K>;
+    using ResultDeviceType = device_storage_type_t<ResultType>;
+
+    auto result_type = is_fixed_point<T>()
+                         ? data_type{type_to_id<ResultType>(), values.type().scale()}
+                         : data_type{type_to_id<ResultType>()};
+
+    std::unique_ptr<column> result =
+      make_fixed_width_column(result_type, values.size(), mask_state::UNALLOCATED, stream, mr);
+
+    if (values.is_empty()) { return result; }
+
+    auto result_table = mutable_table_view({*result});
+    cudf::detail::initialize_with_identity(result_table, {K}, stream);
+
+    auto result_view = mutable_column_device_view::create(result->mutable_view(), stream);
+    auto values_view = column_device_view::create(values, stream);
+
+    if (values.has_nulls()) {
+      auto input = thrust::make_transform_iterator(
+        make_null_replacement_iterator(*values_view, OpType::template identity<DeviceType>()),
+        thrust::identity<ResultDeviceType>{});
+      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                    group_labels.begin(),
+                                    group_labels.end(),
+                                    input,
+                                    result_view->begin<ResultDeviceType>(),
+                                    thrust::equal_to<size_type>{},
+                                    OpType{});
+      result->set_null_mask(cudf::detail::copy_bitmask(values, stream));
+    } else {
+      auto input = thrust::make_transform_iterator(values_view->begin<DeviceType>(),
+                                                   thrust::identity<ResultDeviceType>{});
+      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                    group_labels.begin(),
+                                    group_labels.end(),
+                                    input,
+                                    result_view->begin<ResultDeviceType>(),
+                                    thrust::equal_to<size_type>{},
+                                    OpType{});
+    }
+    return result;
+  }
+
+  template <typename T, typename... Args>
+  std::enable_if_t<not is_supported<T>(), std::unique_ptr<column>> operator()(Args&&... args)
+  {
+    CUDF_FAIL("Unsupported groupby scan type-agg combination");
+  }
+};
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu
new file mode 100644
index 00000000000..ae9b1c321d4
--- /dev/null
+++ b/cpp/src/groupby/sort/group_sum_scan.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <groupby/sort/group_scan_util.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+std::unique_ptr<column> sum_scan(column_view const& values,
+                                 size_type num_groups,
+                                 cudf::device_span<size_type const> group_labels,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  return type_dispatcher(
+    values.type(), scan_functor<aggregation::SUM>{}, values, num_groups, group_labels, stream, mr);
+}
+
+}  // namespace detail
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
new file mode 100644
index 00000000000..63de4ea8684
--- /dev/null
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <groupby/common/utils.hpp>
+#include <groupby/sort/functors.hpp>
+#include <groupby/sort/group_scan.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/aggregation/result_cache.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <memory>
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+/**
+ * @brief Functor used to dispatch and compute groupby scan aggregations
+ *
+ * This functor is to be used with `aggregation_dispatcher` to compute the
+ * appropriate aggregation. If the values on which to run the aggregation are
+ * unchanged, then this functor should be re-used. This is because it stores
+ * memoised sorted and/or grouped values and re-using will save on computation
+ * of these values.
+ */
+struct scan_result_functor final : store_result_functor {
+  using store_result_functor::store_result_functor;
+  template <aggregation::Kind k>
+  void operator()(aggregation const& agg)
+  {
+    CUDF_FAIL("Unsupported groupby scan aggregation");
+  }
+
+ private:
+  column_view get_grouped_values()
+  {
+    // TODO (dm): After implementing single pass multi-agg, explore making a
+    //            cache of all grouped value columns rather than one at a time
+    if (grouped_values)
+      return grouped_values->view();
+    else
+      return (grouped_values = helper.grouped_values(values))->view();
+  };
+};
+
+template <>
+void scan_result_functor::operator()<aggregation::SUM>(aggregation const& agg)
+{
+  if (cache.has_result(col_idx, agg)) return;
+
+  cache.add_result(
+    col_idx,
+    agg,
+    detail::sum_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr));
+}
+
+template <>
+void scan_result_functor::operator()<aggregation::MIN>(aggregation const& agg)
+{
+  if (cache.has_result(col_idx, agg)) return;
+
+  cache.add_result(
+    col_idx,
+    agg,
+    detail::min_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr));
+}
+
+template <>
+void scan_result_functor::operator()<aggregation::MAX>(aggregation const& agg)
+{
+  if (cache.has_result(col_idx, agg)) return;
+
+  cache.add_result(
+    col_idx,
+    agg,
+    detail::max_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr));
+}
+
+template <>
+void scan_result_functor::operator()<aggregation::COUNT_ALL>(aggregation const& agg)
+{
+  if (cache.has_result(col_idx, agg)) return;
+
+  cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(), stream, mr));
+}
+}  // namespace detail
+
+// Sort-based groupby
+std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort_scan(
+  std::vector<aggregation_request> const& requests,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  // We're going to start by creating a cache of results so that aggs that
+  // depend on other aggs will not have to be recalculated. e.g. mean depends on
+  // sum and count. std depends on mean and count
+  cudf::detail::result_cache cache(requests.size());
+
+  for (size_t i = 0; i < requests.size(); i++) {
+    auto store_functor =
+      detail::scan_result_functor(i, requests[i].values, helper(), cache, stream, mr);
+    for (auto const& aggregation : requests[i].aggregations) {
+      // TODO (dm): single pass compute all supported reductions
+      cudf::detail::aggregation_dispatcher(aggregation->kind, store_functor, *aggregation);
+    }
+  }
+
+  auto results = detail::extract_results(requests, cache);
+
+  return std::make_pair(helper().sorted_keys(stream, mr), std::move(results));
+}
+}  // namespace groupby
+}  // namespace cudf
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 6e8937607b9..538e238b5ea 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -18,7 +18,7 @@
 #include <io/utilities/block_utils.cuh>
 #include <io/utilities/column_buffer.hpp>
 
-#include <cudf/detail/utilities/release_assert.cuh>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/utilities/bit.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -68,7 +68,7 @@ struct page_state_s {
   // (leaf) value decoding
   int32_t nz_count;  // number of valid entries in nz_idx (write position in circular buffer)
   int32_t dict_pos;  // write position of dictionary indices
-  int32_t out_pos;   // read position of final output
+  int32_t src_pos;   // input read position of final output value
   int32_t ts_scale;  // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale
   uint32_t nz_idx[non_zero_buffer_size];    // circular buffer of non-null value positions
   uint32_t dict_idx[non_zero_buffer_size];  // Dictionary index, boolean, or string offset values
@@ -963,6 +963,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s,
     if (d + t < s->page.num_nesting_levels) {
       s->page.nesting[d + t].valid_count = 0;
       s->page.nesting[d + t].value_count = 0;
+      s->page.nesting[d + t].null_count  = 0;
     }
     d += blockDim.x;
   }
@@ -1029,13 +1030,13 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s,
         s->dtype_len = 8;  // Convert to 64-bit timestamp
       }
 
-      // first row within the page to start reading
+      // first row within the page to output
       if (page_start_row >= min_row) {
         s->first_row = 0;
       } else {
         s->first_row = (int32_t)min(min_row - page_start_row, (size_t)s->page.num_rows);
       }
-      // # of rows within the page to read
+      // # of rows within the page to output
       s->num_rows = s->page.num_rows;
       if ((page_start_row + s->first_row) + s->num_rows > min_row + num_rows) {
         s->num_rows =
@@ -1127,43 +1128,54 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s,
     s->nz_count                          = 0;
     s->num_input_values                  = s->page.num_input_values;
     s->dict_pos                          = 0;
-    s->out_pos                           = 0;
-
-    // handle row bounds (skip_rows, min_rows)
-    s->input_row_count = s->first_row;
+    s->src_pos                           = 0;
+
+    // for flat hierarchies, we can't know how many leaf values to skip unless we do a full
+    // preprocess of the definition levels (since nulls will have no actual decodable value, there
+    // is no direct correlation between # of rows and # of decodable values).  so we will start
+    // processing at the beginning of the value stream and disregard any indices that start
+    // before the first row.
+    if (s->col.max_level[level_type::REPETITION] == 0) {
+      s->page.skipped_values      = 0;
+      s->page.skipped_leaf_values = 0;
+      s->input_value_count        = 0;
+      s->input_row_count          = 0;
 
-    // return the lower bound to compare (page-relative) thread row index against. Explanation:
-    // In the case of nested schemas, rows can span page boundaries.  That is to say,
-    // we can encounter the first value for row X on page M, but the last value for page M
-    // might not be the last value for row X. page M+1 (or further) may contain the last value.
-    //
-    // This means that the first values we encounter for a given page (M+1) may not belong to the
-    // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that
-    // previous row is within the overall row bounds, include the values by allowing relative row
-    // index -1
-    int max_row = (min_row + num_rows) - 1;
-    if (min_row < page_start_row && max_row >= page_start_row - 1) {
       s->row_index_lower_bound = -1;
-    } else {
-      s->row_index_lower_bound = s->first_row;
     }
-
-    // if we're in the decoding step, jump directly to the first
-    // value we care about
-    if (s->col.column_data_base != nullptr) {
-      // for flat hierarchies, we haven't computed skipped_values yet, but we can do so trivially
-      // now
-      if (s->col.max_level[level_type::REPETITION] == 0) {
-        s->page.skipped_values      = s->first_row;
-        s->page.skipped_leaf_values = s->first_row;
+    // for nested hierarchies, we have run a preprocess that lets us skip directly to the values
+    // we need to start decoding at
+    else {
+      // input_row_count translates to "how many rows we have processed so far", so since we are
+      // skipping directly to where we want to start decoding, set it to first_row
+      s->input_row_count = s->first_row;
+
+      // return the lower bound to compare (page-relative) thread row index against. Explanation:
+      // In the case of nested schemas, rows can span page boundaries.  That is to say,
+      // we can encounter the first value for row X on page M, but the last value for page M
+      // might not be the last value for row X. page M+1 (or further) may contain the last value.
+      //
+      // This means that the first values we encounter for a given page (M+1) may not belong to the
+      // row indicated by chunk_row, but to the row before it that spanned page boundaries. If that
+      // previous row is within the overall row bounds, include the values by allowing relative row
+      // index -1
+      int const max_row = (min_row + num_rows) - 1;
+      if (min_row < page_start_row && max_row >= page_start_row - 1) {
+        s->row_index_lower_bound = -1;
+      } else {
+        s->row_index_lower_bound = s->first_row;
       }
 
-      s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0;
-    } else {
-      s->input_value_count        = 0;
-      s->input_leaf_count         = 0;
-      s->page.skipped_values      = -1;
-      s->page.skipped_leaf_values = -1;
+      // if we're in the decoding step, jump directly to the first
+      // value we care about
+      if (s->col.column_data_base != nullptr) {
+        s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0;
+      } else {
+        s->input_value_count        = 0;
+        s->input_leaf_count         = 0;
+        s->page.skipped_values      = -1;
+        s->page.skipped_leaf_values = -1;
+      }
     }
 
     __threadfence_block();
@@ -1279,7 +1291,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu
                                                              int t)
 {
   // max nesting depth of the column
-  int max_depth = s->col.max_nesting_depth;
+  int const max_depth = s->col.max_nesting_depth;
   // how many (input) values we've processed in the page so far
   int input_value_count = s->input_value_count;
   // how many rows we've processed in the page so far
@@ -1304,19 +1316,19 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu
 
     // track (page-relative) row index for the thread so we can compare against input bounds
     // keep track of overall # of rows we've read.
-    int is_new_row               = start_depth == 0 ? 1 : 0;
-    uint32_t warp_row_count_mask = ballot(is_new_row);
-    int32_t thread_row_index =
+    int const is_new_row               = start_depth == 0 ? 1 : 0;
+    uint32_t const warp_row_count_mask = ballot(is_new_row);
+    int32_t const thread_row_index =
       input_row_count + ((__popc(warp_row_count_mask & ((1 << t) - 1)) + is_new_row) - 1);
     input_row_count += __popc(warp_row_count_mask);
-    // is this thread within row bounds?
-    int in_row_bounds = thread_row_index >= s->row_index_lower_bound &&
-                            thread_row_index < (s->first_row + s->num_rows)
-                          ? 1
-                          : 0;
+    // is this thread within read row bounds?
+    int const in_row_bounds = thread_row_index >= s->row_index_lower_bound &&
+                                  thread_row_index < (s->first_row + s->num_rows)
+                                ? 1
+                                : 0;
 
     // compute warp and thread value counts
-    uint32_t warp_count_mask =
+    uint32_t const warp_count_mask =
       ballot((0 >= start_depth && 0 <= end_depth) && in_row_bounds ? 1 : 0);
 
     warp_value_count = __popc(warp_count_mask);
@@ -1329,36 +1341,35 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu
       PageNestingInfo *pni = &s->page.nesting[s_idx];
 
       // if we are within the range of nesting levels we should be adding value indices for
-      int in_nesting_bounds =
+      int const in_nesting_bounds =
         ((s_idx >= start_depth && s_idx <= end_depth) && in_row_bounds) ? 1 : 0;
 
       // everything up to the max_def_level is a non-null value
-      uint32_t is_valid = 0;
-      if (d >= pni->max_def_level && in_nesting_bounds) { is_valid = 1; }
+      uint32_t const is_valid = d >= pni->max_def_level && in_nesting_bounds ? 1 : 0;
 
       // compute warp and thread valid counts
-      uint32_t warp_valid_mask;
-      // for flat schemas, a simple ballot_sync gives us the correct count and bit positions because
-      // every value in the input matches to a value in the output
-      if (max_depth == 0) {
-        warp_valid_mask = ballot(is_valid);
-      }
-      // for nested schemas, it's more complicated.  This warp will visit 32 incoming values,
-      // however not all of them will necessarily represent a value at this nesting level. so the
-      // validity bit for thread t might actually represent output value t-6. the correct position
-      // for thread t's bit is cur_value_count. for cuda 11 we could use __reduce_or_sync(), but
-      // until then we have to do a warp reduce.
-      else {
-        warp_valid_mask = WarpReduceOr32(is_valid << thread_value_count);
-      }
+      uint32_t const warp_valid_mask =
+        // for flat schemas, a simple ballot_sync gives us the correct count and bit positions
+        // because every value in the input matches to a value in the output
+        max_depth == 1
+          ? ballot(is_valid)
+          :
+          // for nested schemas, it's more complicated.  This warp will visit 32 incoming values,
+          // however not all of them will necessarily represent a value at this nesting level. so
+          // the validity bit for thread t might actually represent output value t-6. the correct
+          // position for thread t's bit is cur_value_count. for cuda 11 we could use
+          // __reduce_or_sync(), but until then we have to do a warp reduce.
+          WarpReduceOr32(is_valid << thread_value_count);
+
       thread_valid_count = __popc(warp_valid_mask & ((1 << thread_value_count) - 1));
       warp_valid_count   = __popc(warp_valid_mask);
 
       // if this is the value column emit an index for value decoding
       if (is_valid && s_idx == max_depth - 1) {
-        int idx                       = pni->valid_count + thread_valid_count;
-        int ofs                       = pni->value_count + thread_value_count;
-        s->nz_idx[rolling_index(idx)] = ofs;
+        int const src_pos = pni->valid_count + thread_valid_count;
+        int const dst_pos = pni->value_count + thread_value_count;
+        // nz_idx is a mapping of src buffer indices to destination buffer indices
+        s->nz_idx[rolling_index(src_pos)] = dst_pos;
       }
 
       // compute warp and thread value counts for the -next- nesting level. we need to
@@ -1366,7 +1377,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu
       // level. more concretely : the offset for the current nesting level == current length of the
       // next nesting level
       if (s_idx < max_depth - 1) {
-        uint32_t next_warp_count_mask =
+        uint32_t const next_warp_count_mask =
           ballot((s_idx + 1 >= start_depth && s_idx + 1 <= end_depth && in_row_bounds) ? 1 : 0);
         next_warp_value_count   = __popc(next_warp_count_mask);
         next_thread_value_count = __popc(next_warp_count_mask & ((1 << t) - 1));
@@ -1375,17 +1386,36 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu
         // and we have a valid data_out pointer, it implies this is a list column, so
         // emit an offset.
         if (in_nesting_bounds && pni->data_out != nullptr) {
-          int idx             = pni->value_count + thread_value_count;
-          cudf::size_type ofs = s->page.nesting[s_idx + 1].value_count + next_thread_value_count +
-                                s->page.nesting[s_idx + 1].page_start_value;
+          int const idx             = pni->value_count + thread_value_count;
+          cudf::size_type const ofs = s->page.nesting[s_idx + 1].value_count +
+                                      next_thread_value_count +
+                                      s->page.nesting[s_idx + 1].page_start_value;
           (reinterpret_cast<cudf::size_type *>(pni->data_out))[idx] = ofs;
         }
       }
 
-      // increment count of valid values, count of total values, and validity mask
+      // nested schemas always read and write to the same bounds (that is, read and write positions
+      // are already pre-bounded by first_row/num_rows). flat schemas will start reading at the
+      // first value, even if that is before first_row, because we cannot trivially jump to
+      // the correct position to start reading. since we are about to write the validity vector here
+      // we need to adjust our computed mask to take into account the write row bounds.
+      int const in_write_row_bounds =
+        max_depth == 1
+          ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows)
+          : in_row_bounds;
+      int const first_thread_in_write_range =
+        max_depth == 1 ? __ffs(ballot(in_write_row_bounds)) - 1 : 0;
+      // # of bits of the validity mask to write out
+      int const warp_valid_mask_bit_count =
+        first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range;
+
+      // increment count of valid values, count of total values, and update validity mask
       if (!t) {
-        if (pni->valid_map != nullptr && in_row_bounds) {
-          store_validity(pni, warp_valid_mask, warp_value_count);
+        if (pni->valid_map != nullptr && warp_valid_mask_bit_count > 0) {
+          uint32_t const warp_output_valid_mask = warp_valid_mask >> first_thread_in_write_range;
+          store_validity(pni, warp_output_valid_mask, warp_valid_mask_bit_count);
+
+          pni->null_count += warp_valid_mask_bit_count - __popc(warp_output_valid_mask);
         }
         pni->valid_count += warp_valid_count;
         pni->value_count += warp_value_count;
@@ -1669,16 +1699,17 @@ extern "C" __global__ void __launch_bounds__(block_size)
       ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32;
   }
 
+  // skipped_leaf_values will always be 0 for flat hierarchies.
   uint32_t skipped_leaf_values = s->page.skipped_leaf_values;
-  while (!s->error && (s->input_value_count < s->num_input_values || s->out_pos < s->nz_count)) {
+  while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) {
     int target_pos;
-    int out_pos = s->out_pos;
+    int src_pos = s->src_pos;
 
     if (t < out_thread0) {
       target_pos =
-        min(out_pos + 2 * (block_size - out_thread0), s->nz_count + (block_size - out_thread0));
+        min(src_pos + 2 * (block_size - out_thread0), s->nz_count + (block_size - out_thread0));
     } else {
-      target_pos = min(s->nz_count, out_pos + block_size - out_thread0);
+      target_pos = min(s->nz_count, src_pos + block_size - out_thread0);
       if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); }
     }
     __syncthreads();
@@ -1689,6 +1720,7 @@ extern "C" __global__ void __launch_bounds__(block_size)
       // - produces non-NULL value indices in s->nz_idx for subsequent decoding
       gpuDecodeLevels(s, target_pos, t);
     } else if (t < out_thread0) {
+      // skipped_leaf_values will always be 0 for flat hierarchies.
       uint32_t src_target_pos = target_pos + skipped_leaf_values;
 
       // WARP1: Decode dictionary indices, booleans or string positions
@@ -1703,49 +1735,72 @@ extern "C" __global__ void __launch_bounds__(block_size)
     } else {
       // WARP1..WARP3: Decode values
       int dtype = s->col.data_type & 7;
-      out_pos += t - out_thread0;
-      uint32_t src_pos = out_pos + skipped_leaf_values;
-
-      int output_value_idx = s->nz_idx[rolling_index(out_pos)];
+      src_pos += t - out_thread0;
+
+      // the position in the output column/buffer
+      int dst_pos = s->nz_idx[rolling_index(src_pos)];
+
+      // for the flat hierarchy case we will be reading from the beginning of the value stream,
+      // regardless of the value of first_row. so adjust our destination offset accordingly.
+      // example:
+      // - user has passed skip_rows = 2, so our first_row to output is 2
+      // - the row values we get from nz_idx will be
+      //   0, 1, 2, 3, 4 ....
+      // - by shifting these values by first_row, the sequence becomes
+      //   -2, -1, 0, 1, 2 ...
+      // - so we will end up ignoring the first two input rows, and input rows 2..n will
+      //   get written to the output starting at position 0.
+      //
+      if (s->col.max_nesting_depth == 1) { dst_pos -= s->first_row; }
+
+      // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
+      // before first_row) in the flat hierarchy case.
+      if (src_pos < target_pos && dst_pos >= 0) {
+        // src_pos represents the logical row position we want to read from. But in the case of
+        // nested hierarchies, there is no 1:1 mapping of rows to values.  So our true read position
+        // has to take into account the # of values we have to skip in the page to get to the
+        // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+        uint32_t val_src_pos = src_pos + skipped_leaf_values;
 
-      if (out_pos < target_pos && output_value_idx >= 0 && output_value_idx < s->num_input_values) {
         // nesting level that is storing actual leaf values
         int leaf_level_index = s->col.max_nesting_depth - 1;
 
         uint32_t dtype_len = s->dtype_len;
-        void *dst          = s->page.nesting[leaf_level_index].data_out +
-                    static_cast<size_t>(output_value_idx) * dtype_len;
-        if (dtype == BYTE_ARRAY)
-          gpuOutputString(s, src_pos, dst);
-        else if (dtype == BOOLEAN)
-          gpuOutputBoolean(s, src_pos, static_cast<uint8_t *>(dst));
-        else if (s->col.converted_type == DECIMAL) {
+        void *dst =
+          s->page.nesting[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+        if (dtype == BYTE_ARRAY) {
+          gpuOutputString(s, val_src_pos, dst);
+        } else if (dtype == BOOLEAN) {
+          gpuOutputBoolean(s, val_src_pos, static_cast<uint8_t *>(dst));
+        } else if (s->col.converted_type == DECIMAL) {
           switch (dtype) {
-            case INT32: gpuOutputFast(s, src_pos, static_cast<uint32_t *>(dst)); break;
-            case INT64: gpuOutputFast(s, src_pos, static_cast<uint2 *>(dst)); break;
+            case INT32: gpuOutputFast(s, val_src_pos, static_cast<uint32_t *>(dst)); break;
+            case INT64: gpuOutputFast(s, val_src_pos, static_cast<uint2 *>(dst)); break;
             default:
               // we currently do not support reading byte arrays larger than DECIMAL64
               if (s->dtype_len_in <= 8) {
-                gpuOutputFixedLenByteArrayAsInt64(s, src_pos, static_cast<int64_t *>(dst));
+                gpuOutputFixedLenByteArrayAsInt64(s, val_src_pos, static_cast<int64_t *>(dst));
               } else {
-                gpuOutputDecimalAsFloat(s, src_pos, static_cast<double *>(dst), dtype);
+                gpuOutputDecimalAsFloat(s, val_src_pos, static_cast<double *>(dst), dtype);
               }
               break;
           }
-        } else if (dtype == INT96)
-          gpuOutputInt96Timestamp(s, src_pos, static_cast<int64_t *>(dst));
-        else if (dtype_len == 8) {
-          if (s->ts_scale)
-            gpuOutputInt64Timestamp(s, src_pos, static_cast<int64_t *>(dst));
-          else
-            gpuOutputFast(s, src_pos, static_cast<uint2 *>(dst));
-        } else if (dtype_len == 4)
-          gpuOutputFast(s, src_pos, static_cast<uint32_t *>(dst));
-        else
-          gpuOutputGeneric(s, src_pos, static_cast<uint8_t *>(dst), dtype_len);
+        } else if (dtype == INT96) {
+          gpuOutputInt96Timestamp(s, val_src_pos, static_cast<int64_t *>(dst));
+        } else if (dtype_len == 8) {
+          if (s->ts_scale) {
+            gpuOutputInt64Timestamp(s, val_src_pos, static_cast<int64_t *>(dst));
+          } else {
+            gpuOutputFast(s, val_src_pos, static_cast<uint2 *>(dst));
+          }
+        } else if (dtype_len == 4) {
+          gpuOutputFast(s, val_src_pos, static_cast<uint32_t *>(dst));
+        } else {
+          gpuOutputGeneric(s, val_src_pos, static_cast<uint8_t *>(dst), dtype_len);
+        }
       }
 
-      if (t == out_thread0) { *(volatile int32_t *)&s->out_pos = target_pos; }
+      if (t == out_thread0) { *(volatile int32_t *)&s->src_pos = target_pos; }
     }
     __syncthreads();
   }
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index ad3c214069f..555259c443d 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -98,6 +98,7 @@ struct PageNestingInfo {
   // set during data decoding
   int32_t valid_count;       // # of valid values decoded in this page/nesting-level
   int32_t value_count;       // total # of values decoded in this page/nesting-level
+  int32_t null_count;        // null count
   int32_t valid_map_offset;  // current offset in bits relative to valid_map
   uint8_t *data_out;         // pointer into output buffer
   uint32_t *valid_map;       // pointer into output validity buffer
@@ -128,7 +129,17 @@ struct PageInfo {
   Encoding definition_level_encoding;  // Encoding used for definition levels (data page)
   Encoding repetition_level_encoding;  // Encoding used for repetition levels (data page)
 
+  // for nested types, we run a preprocess step in order to determine output
+  // column sizes. Because of this, we can jump directly to the position in the
+  // input data to start decoding instead of reading all of the data and discarding
+  // rows we don't care about.
+  //
+  // NOTE: for flat hierarchies we do not do the preprocess step, so skipped_values and
+  // skipped_leaf_values will always be 0.
+  //
+  // # of values skipped in the repetition/definition level stream
   int skipped_values;
+  // # of values skipped in the actual data stream.
   int skipped_leaf_values;
 
   // nesting information (input/output) for each page
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 16cf0877c23..698eb1569cb 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1361,7 +1361,7 @@ void reader::impl::decode_page_data(hostdevice_vector<gpu::ColumnChunkDesc> &chu
       if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) {
         continue;
       }
-      out_buf.null_count() += pni[l_idx].value_count - pni[l_idx].valid_count;
+      out_buf.null_count() += pni[l_idx].null_count;
     }
   }
 
diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu
new file mode 100644
index 00000000000..70a6826d9eb
--- /dev/null
+++ b/cpp/src/labeling/label_bins.cu
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/label_bins.hpp>
+#include <cudf/detail/valid_if.cuh>
+#include <cudf/labeling/label_bins.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <thrust/advance.h>
+#include <thrust/binary_search.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#include <thrust/pair.h>
+#include <thrust/transform.h>
+
+#include <limits>
+#include <utility>
+
+namespace cudf {
+namespace detail {
+namespace {
+
+// Sentinel that bin_finder writes for any input that is null or that does not
+// fall inside a bin; filter_null_sentinel later turns these rows into nulls.
+// NOTE: In theory if a user decided to specify 2^31 bins this would fail. We
+// could make this an error in Python, but that is such a crazy edge case...
+constexpr size_type NULL_VALUE{std::numeric_limits<size_type>::max()};
+
+/*
+ * Functor for finding bins using thrust::transform.
+ *
+ * This functor is stateful, in the sense that it stores (for read-only use)
+ * iterators to the edge ranges on construction to enable natural use with
+ * thrust::transform semantics. Each input is a (value, is_valid) pair. The
+ * functor returns the index of the bin containing the value, or the
+ * NULL_VALUE sentinel when the input is null or the candidate bin does not
+ * contain it.
+ *
+ * NOTE(review): only one candidate bin (the one just before the lower_bound
+ * result on the left edges) is tested against its right edge, so this
+ * presumably expects sorted, non-overlapping bins - confirm with callers.
+ */
+template <typename T,
+          typename RandomAccessIterator,
+          typename LeftComparator,
+          typename RightComparator>
+struct bin_finder {
+  bin_finder(RandomAccessIterator left_begin,
+             RandomAccessIterator left_end,
+             RandomAccessIterator right_begin)
+    : m_left_begin(left_begin), m_left_end(left_end), m_right_begin(right_begin)
+  {
+  }
+
+  __device__ size_type operator()(thrust::pair<T, bool> input_value) const
+  {
+    // Immediately return sentinel for null inputs.
+    if (!input_value.second) return NULL_VALUE;
+
+    // Search the left edges with the left comparator; the candidate bin is the one
+    // immediately before the returned position.
+    T value    = input_value.first;
+    auto bound = thrust::lower_bound(thrust::seq, m_left_begin, m_left_end, value, m_left_comp);
+
+    // Exit early and return sentinel for values that lie below the interval.
+    if (bound == m_left_begin) { return NULL_VALUE; }
+
+    // Accept the candidate bin only if the value also passes the right-edge comparison.
+    auto index = thrust::distance(m_left_begin, thrust::prev(bound));
+    return (m_right_comp(value, m_right_begin[index])) ? index : NULL_VALUE;
+  }
+
+  const RandomAccessIterator
+    m_left_begin{};  // The beginning of the range containing the left bin edges.
+  const RandomAccessIterator m_left_end{};  // The end of the range containing the left bin edges.
+  const RandomAccessIterator
+    m_right_begin{};                   // The beginning of the range containing the right bin edges.
+  const LeftComparator m_left_comp{};  // Comparator used for left edges.
+  const RightComparator m_right_comp{};  // Comparator used for right edges.
+};
+
+// Predicate used with valid_if to build the output null mask: returns true for rows
+// that hold a real bin index and false for rows holding the NULL_VALUE sentinel
+// written by bin_finder::operator().
+struct filter_null_sentinel {
+  __device__ bool operator()(size_type i) const { return i != NULL_VALUE; }
+};
+
+// Bin the input by the edges in left_edges and right_edges.
+//
+// Produces a size_type column with one row per input row holding the index of the
+// bin that contains the value. Rows that are null in the input, or whose value is
+// not contained in a bin, are null in the output.
+//
+// @tparam T               Element type used to read the input and edge columns.
+// @tparam LeftComparator  Comparator applied against the left edges.
+// @tparam RightComparator Comparator applied against the right edges.
+// @param input       Values to bin.
+// @param left_edges  Left edge of each bin.
+// @param right_edges Right edge of each bin.
+// @param stream      CUDA stream on which the work is ordered.
+// @param mr          Memory resource used for the returned column and its null mask.
+template <typename T, typename LeftComparator, typename RightComparator>
+std::unique_ptr<column> label_bins(column_view const& input,
+                                   column_view const& left_edges,
+                                   column_view const& right_edges,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr)
+{
+  auto output = make_numeric_column(
+    data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
+  auto output_mutable_view = output->mutable_view();
+  auto output_begin        = output_mutable_view.begin<size_type>();
+  auto output_end          = output_mutable_view.end<size_type>();
+
+  // These device column views are necessary for creating iterators that work
+  // for columns of compound types. The column_view iterators fail for compound
+  // types because they return raw pointers to the start of the data. The output
+  // does not require these iterators because it's always a primitive type.
+  auto input_device_view       = column_device_view::create(input, stream);
+  auto left_edges_device_view  = column_device_view::create(left_edges, stream);
+  auto right_edges_device_view = column_device_view::create(right_edges, stream);
+
+  auto left_begin  = left_edges_device_view->begin<T>();
+  auto left_end    = left_edges_device_view->end<T>();
+  auto right_begin = right_edges_device_view->begin<T>();
+
+  using RandomAccessIterator = decltype(left_edges_device_view->begin<T>());
+
+  // The pair iterator's boolean template argument selects whether the validity
+  // of each row is read from the input's null mask.
+  if (input.has_nulls()) {
+    thrust::transform(rmm::exec_policy(stream),
+                      input_device_view->pair_begin<T, true>(),
+                      input_device_view->pair_end<T, true>(),
+                      output_begin,
+                      bin_finder<T, RandomAccessIterator, LeftComparator, RightComparator>(
+                        left_begin, left_end, right_begin));
+  } else {
+    thrust::transform(rmm::exec_policy(stream),
+                      input_device_view->pair_begin<T, false>(),
+                      input_device_view->pair_end<T, false>(),
+                      output_begin,
+                      bin_finder<T, RandomAccessIterator, LeftComparator, RightComparator>(
+                        left_begin, left_end, right_begin));
+  }
+
+  // Build the null mask on the caller's stream and memory resource rather than the
+  // defaults, then move it into the column instead of copying the buffer.
+  auto mask_and_count = valid_if(output_begin, output_end, filter_null_sentinel(), stream, mr);
+
+  output->set_null_mask(std::move(mask_and_count.first), mask_and_count.second);
+  return output;
+}
+
+// Compile-time predicate: a type can be binned only when its values support both
+// relational and equality comparison with one another.
+template <typename T>
+constexpr auto is_supported_bin_type()
+{
+  constexpr bool has_ordering = cudf::is_relationally_comparable<T, T>();
+  constexpr bool has_equality = cudf::is_equality_comparable<T, T>();
+  return has_ordering && has_equality;
+}
+
+// Functor for type_dispatcher: selects the label_bins instantiation that matches
+// the requested edge inclusivity, or fails for types that cannot be binned.
+struct bin_type_dispatcher {
+  // Fallback for element types rejected by is_supported_bin_type; always fails.
+  template <typename T, typename... Args>
+  std::enable_if_t<not detail::is_supported_bin_type<T>(), std::unique_ptr<column>> operator()(
+    Args&&...)
+  {
+    CUDF_FAIL("Type not supported for cudf::label_bins");
+  }
+
+  // Maps the runtime (left_inclusive, right_inclusive) pair to comparator types:
+  // an inclusive edge uses thrust::less_equal, an exclusive edge uses thrust::less.
+  template <typename T>
+  std::enable_if_t<detail::is_supported_bin_type<T>(), std::unique_ptr<column>> operator()(
+    column_view const& input,
+    column_view const& left_edges,
+    inclusive left_inclusive,
+    column_view const& right_edges,
+    inclusive right_inclusive,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr)
+  {
+    if ((left_inclusive == inclusive::YES) && (right_inclusive == inclusive::YES))
+      return label_bins<T, thrust::less_equal<T>, thrust::less_equal<T>>(
+        input, left_edges, right_edges, stream, mr);
+    if ((left_inclusive == inclusive::YES) && (right_inclusive == inclusive::NO))
+      return label_bins<T, thrust::less_equal<T>, thrust::less<T>>(
+        input, left_edges, right_edges, stream, mr);
+    if ((left_inclusive == inclusive::NO) && (right_inclusive == inclusive::YES))
+      return label_bins<T, thrust::less<T>, thrust::less_equal<T>>(
+        input, left_edges, right_edges, stream, mr);
+    if ((left_inclusive == inclusive::NO) && (right_inclusive == inclusive::NO))
+      return label_bins<T, thrust::less<T>, thrust::less<T>>(
+        input, left_edges, right_edges, stream, mr);
+
+    // Only reachable if an `inclusive` value outside YES/NO is passed in.
+    CUDF_FAIL("Undefined inclusive setting.");
+  }
+};
+
+}  // anonymous namespace
+
+/// Bin the input by the edges in left_edges and right_edges.
+///
+/// Checks that the input and both edge columns have the same type, that the edge
+/// columns have equal length and that neither contains nulls, then dispatches on
+/// the input type to bin_type_dispatcher. An empty input returns an empty
+/// size_type column without dispatching.
+std::unique_ptr<column> label_bins(column_view const& input,
+                                   column_view const& left_edges,
+                                   inclusive left_inclusive,
+                                   column_view const& right_edges,
+                                   inclusive right_inclusive,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE()
+  CUDF_EXPECTS((input.type() == left_edges.type()) && (input.type() == right_edges.type()),
+               "The input and edge columns must have the same types.");
+  CUDF_EXPECTS(left_edges.size() == right_edges.size(),
+               "The left and right edge columns must be of the same length.");
+  CUDF_EXPECTS(!left_edges.has_nulls() && !right_edges.has_nulls(),
+               "The left and right edge columns cannot contain nulls.");
+
+  // Handle empty inputs.
+  if (input.is_empty()) { return make_empty_column(data_type(type_to_id<size_type>())); }
+
+  return type_dispatcher<dispatch_storage_type>(input.type(),
+                                                detail::bin_type_dispatcher{},
+                                                input,
+                                                left_edges,
+                                                left_inclusive,
+                                                right_edges,
+                                                right_inclusive,
+                                                stream,
+                                                mr);
+}
+
+}  // namespace detail
+
+/// Bin the input by the edges in left_edges and right_edges.
+///
+/// Public entry point: forwards all arguments to detail::label_bins, using
+/// rmm::cuda_stream_default as the stream.
+std::unique_ptr<column> label_bins(column_view const& input,
+                                   column_view const& left_edges,
+                                   inclusive left_inclusive,
+                                   column_view const& right_edges,
+                                   inclusive right_inclusive,
+                                   rmm::mr::device_memory_resource* mr)
+{
+  return detail::label_bins(
+    input, left_edges, left_inclusive, right_edges, right_inclusive, rmm::cuda_stream_default, mr);
+}
+}  // namespace cudf
diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu
index 1eb105d296d..529b7489c35 100644
--- a/cpp/src/lists/drop_list_duplicates.cu
+++ b/cpp/src/lists/drop_list_duplicates.cu
@@ -225,6 +225,8 @@ void generate_offsets(size_type num_entries,
                       return offsets[i - prefix_sum_empty_lists[i]];
                     });
 }
+}  // anonymous namespace
+
 /**
  * @copydoc cudf::lists::drop_list_duplicates
  *
@@ -276,7 +278,6 @@ std::unique_ptr<column> drop_list_duplicates(lists_column_view const& lists_colu
                            cudf::detail::copy_bitmask(lists_column.parent(), stream, mr));
 }
 
-}  // anonymous namespace
 }  // namespace detail
 
 /**
diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu
index 336aabde15e..8233635050e 100644
--- a/cpp/src/lists/explode.cu
+++ b/cpp/src/lists/explode.cu
@@ -29,6 +29,7 @@
 #include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
+#include <thrust/optional.h>
 
 #include <memory>
 #include <type_traits>
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 67323126751..1df0a4ab41a 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -15,7 +15,7 @@
  */
 
 #include <cmath>
-#include <cudf/detail/utilities/release_assert.cuh>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -144,7 +144,7 @@ select_quantile(ValueAccessor get_value, size_type size, double q, interpolation
 
     default:
 #if defined(__CUDA_ARCH__)
-      release_assert(false && "Invalid interpolation operation for quantiles");
+      cudf_assert(false && "Invalid interpolation operation for quantiles");
       return Result();
 #else
       CUDF_FAIL("Invalid interpolation operation for quantiles.");
@@ -173,7 +173,7 @@ select_quantile_data(Iterator begin, size_type size, double q, interpolation int
   }
 
 #if defined(__CUDA_ARCH__)
-  release_assert(false && "Invalid interpolation operation for quantiles");
+  cudf_assert(false && "Invalid interpolation operation for quantiles");
   return Result();
 #else
   CUDF_FAIL("Invalid interpolation operation for quantiles.");
@@ -200,7 +200,7 @@ CUDA_HOST_DEVICE_CALLABLE bool select_quantile_validity(Iterator begin,
   }
 
 #if defined(__CUDA_ARCH__)
-  release_assert(false && "Invalid interpolation operation for quantiles");
+  cudf_assert(false && "Invalid interpolation operation for quantiles");
   return false;
 #else
   CUDF_FAIL("Invalid interpolation operation for quantiles.");
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index dcc48aafb39..42562507fa9 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -315,7 +315,7 @@ template <typename InputType,
           std::enable_if_t<!std::is_same<InputType, cudf::string_view>::value and
                            !(op == aggregation::COUNT_VALID || op == aggregation::COUNT_ALL ||
                              op == aggregation::ROW_NUMBER || op == aggregation::LEAD ||
-                             op == aggregation::LAG || op == aggregation::COLLECT)>* = nullptr>
+                             op == aggregation::LAG || op == aggregation::COLLECT_LIST)>* = nullptr>
 bool __device__ process_rolling_window(column_device_view input,
                                        column_device_view ignored_default_outputs,
                                        mutable_column_device_view output,
@@ -814,7 +814,7 @@ struct rolling_window_launcher {
             typename PrecedingWindowIterator,
             typename FollowingWindowIterator>
   std::enable_if_t<!(op == aggregation::MEAN || op == aggregation::LEAD || op == aggregation::LAG ||
-                     op == aggregation::COLLECT),
+                     op == aggregation::COLLECT_LIST),
                    std::unique_ptr<column>>
   operator()(column_view const& input,
              column_view const& default_outputs,
@@ -897,11 +897,11 @@ struct rolling_window_launcher {
   }
 
   /**
-   * @brief Creates the offsets child of the result of the `COLLECT` window aggregation
+   * @brief Creates the offsets child of the result of the `COLLECT_LIST` window aggregation
    *
    * Given the input column, the preceding/following window bounds, and `min_periods`,
    * the sizes of each list row may be computed. These values can then be used to
-   * calculate the offsets for the result of `COLLECT`.
+   * calculate the offsets for the result of `COLLECT_LIST`.
    *
    * Note: If `min_periods` exceeds the number of observations for a window, the size
    * is set to `0` (since the result is `null`).
@@ -945,7 +945,7 @@ struct rolling_window_launcher {
   }
 
   /**
-   * @brief Generate mapping of each row in the COLLECT result's child column
+   * @brief Generate mapping of each row in the COLLECT_LIST result's child column
    * to the index of the row it belongs to.
    *
    *  If
@@ -1030,7 +1030,7 @@ struct rolling_window_launcher {
 
   /**
    * @brief Create gather map to generate the child column of the result of
-   * the `COLLECT` window aggregation.
+   * the `COLLECT_LIST` window aggregation.
    */
   template <typename PrecedingIter>
   std::unique_ptr<column> create_collect_gather_map(column_view const& child_offsets,
@@ -1064,7 +1064,7 @@ struct rolling_window_launcher {
   }
 
   /**
-   * @brief Count null entries in result of COLLECT.
+   * @brief Count null entries in result of COLLECT_LIST.
    */
   size_type count_child_nulls(column_view const& input,
                               std::unique_ptr<column> const& gather_map,
@@ -1139,7 +1139,7 @@ struct rolling_window_launcher {
   }
 
   template <aggregation::Kind op, typename PrecedingIter, typename FollowingIter>
-  std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr<column>> operator()(
+  std::enable_if_t<(op == aggregation::COLLECT_LIST), std::unique_ptr<column>> operator()(
     column_view const& input,
     column_view const& default_outputs,
     PrecedingIter preceding_begin_raw,
@@ -1150,7 +1150,7 @@ struct rolling_window_launcher {
     rmm::mr::device_memory_resource* mr)
   {
     CUDF_EXPECTS(default_outputs.is_empty(),
-                 "COLLECT window function does not support default values.");
+                 "COLLECT_LIST window function does not support default values.");
 
     if (input.is_empty()) return empty_like(input);
 
@@ -1370,6 +1370,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
   auto input_col = cudf::is_dictionary(input.type())
                      ? dictionary_column_view(input).get_indices_annotated()
                      : input;
+
   auto output = cudf::type_dispatcher(input_col.type(),
                                       dispatch_rolling{},
                                       input_col,
diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp
index d7fa92f1978..18bd0ea2217 100644
--- a/cpp/src/rolling/rolling_detail.hpp
+++ b/cpp/src/rolling/rolling_detail.hpp
@@ -41,7 +41,7 @@ static constexpr bool is_rolling_supported()
       (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or
       (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or
       (op == aggregation::MEAN) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or
-      (op == aggregation::LAG) or (op == aggregation::COLLECT);
+      (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST);
 
     constexpr bool is_valid_numeric_agg =
       (cudf::is_numeric<ColumnType>() or cudf::is_duration<ColumnType>() or
@@ -54,23 +54,23 @@ static constexpr bool is_rolling_supported()
     return (op == aggregation::MIN) or (op == aggregation::MAX) or
            (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or
            (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or
-           (op == aggregation::LAG) or (op == aggregation::COLLECT);
+           (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST);
   } else if (cudf::is_fixed_point<ColumnType>()) {
     return (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or
            (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or
            (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or
-           (op == aggregation::LAG) or (op == aggregation::COLLECT);
+           (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST);
   } else if (std::is_same<ColumnType, cudf::string_view>()) {
     return (op == aggregation::MIN) or (op == aggregation::MAX) or
            (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or
-           (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT);
+           (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST);
 
   } else if (std::is_same<ColumnType, cudf::list_view>()) {
     return (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or
-           (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT);
+           (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST);
   } else if (std::is_same<ColumnType, cudf::struct_view>()) {
     // TODO: Add support for COUNT_VALID, COUNT_ALL, ROW_NUMBER.
-    return op == aggregation::COLLECT;
+    return op == aggregation::COLLECT_LIST;
   } else {
     return false;
   }
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 2dcece0b3be..8b46f66a48f 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -35,6 +35,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/logical.h>
+#include <thrust/optional.h>
 
 #include <map>
 #include <vector>
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index 5c5032b5c87..7eee2b3cc0e 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -25,7 +25,6 @@
 #include <cudf/strings/string.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <strings/convert/utilities.cuh>
 #include <strings/utilities.cuh>
@@ -38,6 +37,160 @@
 
 namespace cudf {
 namespace strings {
+
+namespace detail {
+namespace {
+
+/**
+ * @brief Checks (without converting) that a string is a valid integer that fits in IntegerType.
+ */
+template <typename IntegerType>
+struct string_to_integer_check_fn {
+  __device__ bool operator()(thrust::pair<string_view, bool> const& p) const
+  {
+    if (!p.second || p.first.empty()) { return false; }  // flag unset (null) or empty string
+
+    auto const d_str = p.first.data();
+    if (d_str[0] == '-' && std::is_unsigned<IntegerType>::value) { return false; }
+
+    auto iter           = d_str + static_cast<int>((d_str[0] == '-' || d_str[0] == '+'));
+    auto const iter_end = d_str + p.first.size_bytes();
+    if (iter == iter_end) { return false; }  // a lone sign character is not an integer
+
+    auto const sign = d_str[0] == '-' ? IntegerType{-1} : IntegerType{1};
+    auto const bound_val =
+      sign > 0 ? std::numeric_limits<IntegerType>::max() : std::numeric_limits<IntegerType>::min();
+
+    IntegerType value = 0;      // signed running value; never leaves the range of IntegerType
+    while (iter != iter_end) {  // check all bytes for valid characters
+      auto const chr = *iter++;
+      if (chr < '0' || chr > '9') { return false; }  // not a decimal digit
+
+      // Reject if value * 10 + sign * digit would pass bound_val. value carries the sign, so a
+      // prefix equal to min() cannot wrap around and let further digits through.
+      auto const digit       = static_cast<IntegerType>(chr - '0');
+      auto const bound_check = (bound_val - sign * digit) / IntegerType{10};
+      if (sign > 0 ? value > bound_check : value < bound_check) return false;
+      value = value * IntegerType{10} + sign * digit;
+    }
+
+    return true;
+  }
+};
+
+/**
+ * @brief Type-dispatched functor: flags each string that is a valid integer of type T.
+ */
+struct dispatch_is_integer_fn {
+  template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  std::unique_ptr<column> operator()(strings_column_view const& strings,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr) const
+  {
+    auto const d_column = column_device_view::create(strings.parent(), stream);
+    auto results        = make_numeric_column(data_type{type_id::BOOL8},
+                                       strings.size(),
+                                       cudf::detail::copy_bitmask(strings.parent(), stream, mr),
+                                       strings.null_count(),
+                                       stream,
+                                       mr);
+
+    auto d_results = results->mutable_view().data<bool>();
+    if (strings.has_nulls()) {
+      thrust::transform(rmm::exec_policy(stream),
+                        d_column->pair_begin<string_view, true>(),
+                        d_column->pair_end<string_view, true>(),
+                        d_results,
+                        string_to_integer_check_fn<T>{});
+    } else {
+      thrust::transform(rmm::exec_policy(stream),
+                        d_column->pair_begin<string_view, false>(),
+                        d_column->pair_end<string_view, false>(),
+                        d_results,
+                        string_to_integer_check_fn<T>{});
+    }
+
+    // Calling mutable_view() on a column invalidates its null count so we need to set it back
+    results->set_null_count(strings.null_count());
+
+    return results;
+  }
+  // Non-integral types are rejected via CUDF_FAIL
+  template <typename T, std::enable_if_t<not std::is_integral<T>::value>* = nullptr>
+  std::unique_ptr<column> operator()(strings_column_view const&,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*) const
+  {
+    CUDF_FAIL("is_integer is expecting an integer type");
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<column> is_integer(
+  strings_column_view const& strings,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  auto const d_column = column_device_view::create(strings.parent(), stream);
+  auto results        = make_numeric_column(data_type{type_id::BOOL8},
+                                     strings.size(),
+                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
+                                     strings.null_count(),
+                                     stream,
+                                     mr);
+  // Fill one bool per row; rows whose pair flag (p.second) is false are written as false
+  auto d_results = results->mutable_view().data<bool>();
+  if (strings.has_nulls()) {
+    thrust::transform(
+      rmm::exec_policy(stream),
+      d_column->pair_begin<string_view, true>(),
+      d_column->pair_end<string_view, true>(),
+      d_results,
+      [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; });
+  } else {
+    thrust::transform(
+      rmm::exec_policy(stream),
+      d_column->pair_begin<string_view, false>(),
+      d_column->pair_end<string_view, false>(),
+      d_results,
+      [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; });
+  }
+
+  // Calling mutable_view() on a column invalidates its null count so we need to set it back
+  results->set_null_count(strings.null_count());
+
+  return results;
+}
+
+std::unique_ptr<column> is_integer(
+  strings_column_view const& strings,
+  data_type int_type,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  if (strings.is_empty()) { return cudf::make_empty_column(data_type{type_id::BOOL8}); }
+  return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr);
+}
+
+}  // namespace detail
+
+// external APIs
+std::unique_ptr<column> is_integer(strings_column_view const& strings,
+                                   rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::is_integer(strings, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> is_integer(strings_column_view const& strings,
+                                   data_type int_type,
+                                   rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::is_integer(strings, int_type, rmm::cuda_stream_default, mr);
+}
+
 namespace detail {
 namespace {
 /**
@@ -69,11 +222,10 @@ struct dispatch_to_integers_fn {
                   mutable_column_view& output_column,
                   rmm::cuda_stream_view stream) const
   {
-    auto d_results = output_column.data<IntegerType>();
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator<size_type>(0),
                       thrust::make_counting_iterator<size_type>(strings_column.size()),
-                      d_results,
+                      output_column.data<IntegerType>(),
                       string_to_integer_fn<IntegerType>{strings_column});
   }
   // non-integral types throw an exception
@@ -102,19 +254,22 @@ std::unique_ptr<column> to_integers(strings_column_view const& strings,
 {
   size_type strings_count = strings.size();
   if (strings_count == 0) return make_numeric_column(output_type, 0);
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-  // create integer output column copying the strings null-mask
-  auto results      = make_numeric_column(output_type,
+
+  // Create integer output column copying the strings null-mask
+  auto results = make_numeric_column(output_type,
                                      strings_count,
                                      cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                      strings.null_count(),
                                      stream,
                                      mr);
-  auto results_view = results->mutable_view();
-  // fill output column with integers
-  type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream);
+  // Fill output column with integers
+  auto const strings_dev_view = column_device_view::create(strings.parent(), stream);
+  auto results_view           = results->mutable_view();
+  type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream);
+
+  // Calling mutable_view() on a column invalidates its null count so we need to set it back
   results->set_null_count(strings.null_count());
+
   return results;
 }
 
@@ -253,42 +408,5 @@ std::unique_ptr<column> from_integers(column_view const& integers,
   return detail::from_integers(integers, rmm::cuda_stream_default, mr);
 }
 
-namespace detail {
-std::unique_ptr<column> is_integer(
-  strings_column_view const& strings,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
-{
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_column       = *strings_column;
-  // create output column
-  auto results   = make_numeric_column(data_type{type_id::BOOL8},
-                                     strings.size(),
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
-                                     stream,
-                                     mr);
-  auto d_results = results->mutable_view().data<bool>();
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(strings.size()),
-                    d_results,
-                    [d_column] __device__(size_type idx) {
-                      if (d_column.is_null(idx)) return false;
-                      return string::is_integer(d_column.element<string_view>(idx));
-                    });
-  results->set_null_count(strings.null_count());
-  return results;
-}
-}  // namespace detail
-
-// external API
-std::unique_ptr<column> is_integer(strings_column_view const& strings,
-                                   rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::is_integer(strings, rmm::cuda_stream_default, mr);
-}
-
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu
index 95f9ecbe2ef..cac774ef43e 100644
--- a/cpp/src/strings/replace/backref_re.cu
+++ b/cpp/src/strings/replace/backref_re.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,7 +43,8 @@ namespace {
  *
  * The backref numbers are expected to be 1-based.
  *
- * Returns a modified string without back-ref indicators.
+ * Returns a modified string without back-ref indicators and a vector of backref
+ * byte position pairs.
  * ```
  * Example:
  *    for input string:    'hello \2 and \1'
@@ -51,8 +52,9 @@ namespace {
  *    returned string is:  'hello  and '
  * ```
  */
-std::string parse_backrefs(std::string const& repl, std::vector<backref_type>& backrefs)
+std::pair<std::string, std::vector<backref_type>> parse_backrefs(std::string const& repl)
 {
+  std::vector<backref_type> backrefs;
   std::string str = repl;  // make a modifiable copy
   std::smatch m;
   std::regex ex("(\\\\\\d+)");  // this searches for backslash-number(s); example "\1"
@@ -60,21 +62,19 @@ std::string parse_backrefs(std::string const& repl, std::vector<backref_type>& b
   size_type byte_offset = 0;
   while (std::regex_search(str, m, ex)) {
     if (m.size() == 0) break;
-    backref_type item;
-    std::string bref   = m[0];
-    size_type position = static_cast<size_type>(m.position(0));
-    size_type length   = static_cast<size_type>(bref.length());
+    std::string const backref = m[0];
+    size_type const position  = static_cast<size_type>(m.position(0));
+    size_type const length    = static_cast<size_type>(backref.length());
     byte_offset += position;
-    item.first = std::atoi(bref.c_str() + 1);  // back-ref index number
-    CUDF_EXPECTS(item.first > 0, "Back-reference numbers must be greater than 0");
-    item.second = byte_offset;  // position within the string
+    size_type const index = std::atoi(backref.c_str() + 1);  // back-ref index number
+    CUDF_EXPECTS(index > 0, "Back-reference numbers must be greater than 0");
     rtn += str.substr(0, position);
     str = str.substr(position + length);
-    backrefs.push_back(item);
+    backrefs.push_back({index, byte_offset});
   }
   if (!str.empty())  // add the remainder
     rtn += str;      // of the string
-  return rtn;
+  return {rtn, backrefs};
 }
 
 }  // namespace
@@ -87,54 +87,54 @@ std::unique_ptr<column> replace_with_backrefs(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  auto strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
 
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
   CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty");
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
+  auto d_strings = column_device_view::create(strings.parent(), stream);
   // compile regex into device object
-  auto prog   = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
-  auto d_prog = *prog;
-  auto regex_insts = d_prog.insts_counts();
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream);
+  auto const regex_insts = d_prog->insts_counts();
 
   // parse the repl string for backref indicators
-  std::vector<backref_type> h_backrefs;
-  std::string repl_template = parse_backrefs(repl, h_backrefs);
-  rmm::device_vector<backref_type> backrefs(h_backrefs);
-  string_scalar repl_scalar(repl_template);
-  string_view d_repl_template{repl_scalar.data(), repl_scalar.size()};
-
-  // copy null mask
-  auto null_mask  = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-  auto null_count = strings.null_count();
+  auto const parse_result = parse_backrefs(repl);
+  rmm::device_uvector<backref_type> backrefs(parse_result.second.size(), stream);
+  CUDA_TRY(cudaMemcpyAsync(backrefs.data(),
+                           parse_result.second.data(),
+                           sizeof(backref_type) * backrefs.size(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+  string_scalar repl_scalar(parse_result.first, true, stream);
+  string_view const d_repl_template = repl_scalar.value();
+
+  using BackRefIterator = decltype(backrefs.begin());
 
   // create child columns
-  children_pair children(nullptr, nullptr);
-  // Each invocation is predicated on the stack size
-  // which is dependent on the number of regex instructions
-  if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) {
-    children = make_strings_children(
-      backrefs_fn<RX_STACK_SMALL>{
-        d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
-      strings_count,
-      null_count,
-      stream,
-      mr);
-  } else if (regex_insts <= RX_MEDIUM_INSTS)
-    children = replace_with_backrefs_medium(
-      d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr);
-  else
-    children = replace_with_backrefs_large(
-      d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr);
-
-  return make_strings_column(strings_count,
+  children_pair children = [&] {
+    // Each invocation is predicated on the stack size
+    // which is dependent on the number of regex instructions
+    if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) {
+      return make_strings_children(
+        backrefs_fn<BackRefIterator, RX_STACK_SMALL>{
+          *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
+        strings.size(),
+        strings.null_count(),
+        stream,
+        mr);
+    } else if (regex_insts <= RX_MEDIUM_INSTS)
+      return replace_with_backrefs_medium(
+        *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr);
+    else
+      return replace_with_backrefs_large(
+        *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr);
+  }();
+
+  return make_strings_column(strings.size(),
                              std::move(children.first),
                              std::move(children.second),
-                             null_count,
-                             std::move(null_mask),
+                             strings.null_count(),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }
diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh
index f13d84cf9ca..529b91a98e5 100644
--- a/cpp/src/strings/replace/backref_re.cuh
+++ b/cpp/src/strings/replace/backref_re.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,13 +42,13 @@ using backref_type = thrust::pair<size_type, size_type>;
  * Small to medium instruction lengths can use the stack effectively though smaller executes faster.
  * Longer patterns require global memory. Shorter patterns are common in data cleaning.
  */
-template <size_t stack_size>
+template <typename Iterator, size_t stack_size>
 struct backrefs_fn {
   column_device_view const d_strings;
   reprog_device prog;
   string_view const d_repl;  // string replacement template
-  rmm::device_vector<backref_type>::iterator backrefs_begin;
-  rmm::device_vector<backref_type>::iterator backrefs_end;
+  Iterator backrefs_begin;
+  Iterator backrefs_end;
   int32_t* d_offsets{};
   char* d_chars{};
 
@@ -117,7 +117,7 @@ using children_pair = std::pair<std::unique_ptr<column>, std::unique_ptr<column>
 children_pair replace_with_backrefs_medium(column_device_view const& d_strings,
                                            reprog_device& d_prog,
                                            string_view const& d_repl_template,
-                                           rmm::device_vector<backref_type>& backrefs,
+                                           device_span<backref_type> backrefs,
                                            size_type null_count,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr);
@@ -125,7 +125,7 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings,
 children_pair replace_with_backrefs_large(column_device_view const& d_strings,
                                           reprog_device& d_prog,
                                           string_view const& d_repl_template,
-                                          rmm::device_vector<backref_type>& backrefs,
+                                          device_span<backref_type> backrefs,
                                           size_type null_count,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr);
diff --git a/cpp/src/strings/replace/backref_re_large.cu b/cpp/src/strings/replace/backref_re_large.cu
index 0b078132623..56bd8941b8a 100644
--- a/cpp/src/strings/replace/backref_re_large.cu
+++ b/cpp/src/strings/replace/backref_re_large.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,17 +24,17 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
-//
 children_pair replace_with_backrefs_large(column_device_view const& d_strings,
                                           reprog_device& d_prog,
                                           string_view const& d_repl_template,
-                                          rmm::device_vector<backref_type>& backrefs,
+                                          device_span<backref_type> backrefs,
                                           size_type null_count,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
 {
+  using Iterator = decltype(backrefs.begin());
   return make_strings_children(
-    backrefs_fn<RX_STACK_LARGE>{
+    backrefs_fn<Iterator, RX_STACK_LARGE>{
       d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
     d_strings.size(),
     null_count,
diff --git a/cpp/src/strings/replace/backref_re_medium.cu b/cpp/src/strings/replace/backref_re_medium.cu
index 899e0cb2a3e..8b1dd6c5999 100644
--- a/cpp/src/strings/replace/backref_re_medium.cu
+++ b/cpp/src/strings/replace/backref_re_medium.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,17 +24,17 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
-//
 children_pair replace_with_backrefs_medium(column_device_view const& d_strings,
                                            reprog_device& d_prog,
                                            string_view const& d_repl_template,
-                                           rmm::device_vector<backref_type>& backrefs,
+                                           device_span<backref_type> backrefs,
                                            size_type null_count,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
+  using Iterator = decltype(backrefs.begin());
   return make_strings_children(
-    backrefs_fn<RX_STACK_MEDIUM>{
+    backrefs_fn<Iterator, RX_STACK_MEDIUM>{
       d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()},
     d_strings.size(),
     null_count,
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index 3eb551ead18..39725361741 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -139,15 +139,13 @@ std::unique_ptr<column> replace_re(
   auto strings_count = strings.size();
   if (strings_count == 0) return make_empty_strings_column(stream, mr);
   if (patterns.empty())  // no patterns; just return a copy
-    return std::make_unique<column>(strings.parent());
+    return std::make_unique<column>(strings.parent(), stream, mr);
 
   CUDF_EXPECTS(!repls.has_nulls(), "Parameter repls must not have any nulls");
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-  auto repls_column   = column_device_view::create(repls.parent(), stream);
-  auto d_repls        = *repls_column;
-  auto d_flags        = get_character_flags_table();
+  auto d_strings = column_device_view::create(strings.parent(), stream);
+  auto d_repls   = column_device_view::create(repls.parent(), stream);
+  auto d_flags   = get_character_flags_table();
 
   // compile regexes into device objects
   size_type regex_insts = 0;
@@ -170,37 +168,39 @@ std::unique_ptr<column> replace_re(
   reprog_device* d_progs = reinterpret_cast<reprog_device*>(progs_buffer.data());
 
   // create working buffer for ranges pairs
-  rmm::device_vector<found_range> found_ranges(patterns.size() * strings_count);
-  auto d_found_ranges = found_ranges.data().get();
+  rmm::device_uvector<found_range> found_ranges(patterns.size() * strings_count, stream);
+  auto d_found_ranges = found_ranges.data();
 
   // create child columns
-  std::pair<std::unique_ptr<column>, std::unique_ptr<column>> children(nullptr, nullptr);
-  // Each invocation is predicated on the stack size which is dependent on the number of regex
-  // instructions
-  if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
-    children = make_strings_children(
-      replace_multi_regex_fn<RX_STACK_SMALL>{
-        d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, d_repls},
-      strings_count,
-      strings.null_count(),
-      stream,
-      mr);
-  else if (regex_insts <= RX_MEDIUM_INSTS)
-    children = make_strings_children(
-      replace_multi_regex_fn<RX_STACK_MEDIUM>{
-        d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, d_repls},
-      strings_count,
-      strings.null_count(),
-      stream,
-      mr);
-  else
-    children = make_strings_children(
-      replace_multi_regex_fn<RX_STACK_LARGE>{
-        d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, d_repls},
-      strings_count,
-      strings.null_count(),
-      stream,
-      mr);
+  // An immediately-invoked lambda lets each branch return its children pair directly.
+  auto children = [&] {
+    // Each invocation is predicated on the stack size which is dependent on the number of regex
+    // instructions
+    if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS))
+      return make_strings_children(
+        replace_multi_regex_fn<RX_STACK_SMALL>{
+          *d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, *d_repls},
+        strings_count,
+        strings.null_count(),
+        stream,
+        mr);
+    else if (regex_insts <= RX_MEDIUM_INSTS)
+      return make_strings_children(
+        replace_multi_regex_fn<RX_STACK_MEDIUM>{
+          *d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, *d_repls},
+        strings_count,
+        strings.null_count(),
+        stream,
+        mr);
+    else
+      return make_strings_children(
+        replace_multi_regex_fn<RX_STACK_LARGE>{
+          *d_strings, d_progs, static_cast<size_type>(progs.size()), d_found_ranges, *d_repls},
+        strings_count,
+        strings.null_count(),
+        stream,
+        mr);
+  }();
 
   return make_strings_column(strings_count,
                              std::move(children.first),
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index 08af1d76d22..138fe3fa508 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -19,7 +19,6 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/utilities.hpp>
@@ -30,7 +29,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/find.h>
+#include <thrust/binary_search.h>
+#include <thrust/sort.h>
 
 #include <algorithm>
 
@@ -46,31 +46,37 @@ namespace {
  */
 struct translate_fn {
   column_device_view const d_strings;
-  rmm::device_vector<translate_table>::iterator table_begin;
-  rmm::device_vector<translate_table>::iterator table_end;
-  int32_t const* d_offsets{};
+  rmm::device_uvector<translate_table>::iterator table_begin;
+  rmm::device_uvector<translate_table>::iterator table_end;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    string_view d_str = d_strings.element<string_view>(idx);
-    size_type bytes   = d_str.size_bytes();
-    char* out_ptr     = d_offsets ? d_chars + d_offsets[idx] : nullptr;
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    string_view const d_str = d_strings.element<string_view>(idx);
+
+    size_type bytes = d_str.size_bytes();
+    char* out_ptr   = d_chars ? d_chars + d_offsets[idx] : nullptr;
     for (auto chr : d_str) {
-      auto entry =
-        thrust::find_if(thrust::seq, table_begin, table_end, [chr] __device__(auto const& te) {
-          return te.first == chr;
-        });
-      if (entry != table_end) {
+      auto const entry =
+        thrust::lower_bound(thrust::seq,
+                            table_begin,
+                            table_end,
+                            translate_table{chr, 0},
+                            [](auto const& lhs, auto const& rhs) { return lhs.first < rhs.first; });
+      if (entry != table_end && entry->first == chr) {
         bytes -= bytes_in_char_utf8(chr);
-        chr = static_cast<translate_table>(*entry).second;
+        chr = entry->second;
         if (chr)  // if null, skip the character
           bytes += bytes_in_char_utf8(chr);
       }
       if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;
   }
 };
 
@@ -83,8 +89,7 @@ std::unique_ptr<column> translate(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
 
   size_type table_size = static_cast<size_type>(chars_table.size());
   // convert input table
@@ -92,35 +97,32 @@ std::unique_ptr<column> translate(
   std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) {
     return translate_table{entry.first, entry.second};
   });
+  // The size of this table is usually much less than 100 so it was
+  // found to be more efficient to sort on the CPU than the GPU.
+  thrust::sort(htable.begin(), htable.end(), [](auto const& lhs, auto const& rhs) {
+    return lhs.first < rhs.first;
+  });
   // copy translate table to device memory
-  rmm::device_vector<translate_table> table(htable);
-
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-  // create null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-  // create offsets column
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, translate_fn{d_strings, table.begin(), table.end()});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build chars column
-  size_type bytes   = thrust::device_pointer_cast(d_offsets)[strings_count];
-  auto chars_column = strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     strings_count,
-                     translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars});
-
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+  rmm::device_uvector<translate_table> table(htable.size(), stream);
+  CUDA_TRY(cudaMemcpyAsync(table.data(),
+                           htable.data(),
+                           sizeof(translate_table) * htable.size(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  auto d_strings = column_device_view::create(strings.parent(), stream);
+
+  auto children = make_strings_children(translate_fn{*d_strings, table.begin(), table.end()},
+                                        strings.size(),
+                                        strings.null_count(),
+                                        stream,
+                                        mr);
+
+  return make_strings_column(strings.size(),
+                             std::move(children.first),
+                             std::move(children.second),
                              strings.null_count(),
-                             std::move(null_mask),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 6ebe529b56e..e3a43ac25c0 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -14,6 +14,12 @@
  * limitations under the License.
  */
 
+#include <nvtext/normalize.hpp>
+#include <text/subword/detail/data_normalizer.hpp>
+#include <text/utilities/tokenize_ops.cuh>
+
+#include <strings/utilities.cuh>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
@@ -22,17 +28,11 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 
-#include <strings/utilities.cuh>
-
-#include <nvtext/normalize.hpp>
-
-#include <text/subword/detail/data_normalizer.hpp>
-#include <text/utilities/tokenize_ops.cuh>
-
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/for_each.h>
@@ -54,32 +54,39 @@ namespace {
  */
 struct normalize_spaces_fn {
   cudf::column_device_view const d_strings;  // strings to normalize
-  int32_t const* d_offsets{};                // offsets into d_buffer
-  char* d_buffer{};                          // output buffer for characters
+  int32_t* d_offsets{};                      // offsets into d_buffer
+  char* d_chars{};                           // output buffer for characters
 
-  __device__ int32_t operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
-    cudf::string_view single_space(" ", 1);
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    cudf::string_view const single_space(" ", 1);
     auto const d_str = d_strings.element<cudf::string_view>(idx);
-    char* buffer     = d_offsets ? d_buffer + d_offsets[idx] : nullptr;
+    char* buffer     = d_chars ? d_chars + d_offsets[idx] : nullptr;
     char* optr       = buffer;  // running output pointer
     int32_t nbytes   = 0;       // holds the number of bytes per output string
-    // create tokenizer for this string with whitespace delimiter (default)
+
+    // create a tokenizer for this string with whitespace delimiter (default)
     characters_tokenizer tokenizer(d_str);
+
     // this will retrieve tokens automatically skipping runs of whitespace
     while (tokenizer.next_token()) {
-      auto token_pos = tokenizer.token_byte_positions();
+      auto const token_pos = tokenizer.token_byte_positions();
       nbytes += token_pos.second - token_pos.first + 1;  // token size plus a single space
       if (optr) {
-        cudf::string_view token(d_str.data() + token_pos.first, token_pos.second - token_pos.first);
+        cudf::string_view const token(d_str.data() + token_pos.first,
+                                      token_pos.second - token_pos.first);
         if (optr != buffer)  // prepend space unless we are at the beginning
           optr = cudf::strings::detail::copy_string(optr, single_space);
         // write token to output buffer
-        optr = cudf::strings::detail::copy_string(optr, token);  // copy token to output
+        optr = cudf::strings::detail::copy_string(optr, token);
       }
     }
-    return (nbytes > 0) ? nbytes - 1 : 0;  // remove trailing space
+    // remove trailing space
+    if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
   }
 };
 
@@ -95,7 +102,7 @@ struct codepoint_to_utf8_fn {
   cudf::column_device_view const d_strings;  // input strings
   uint32_t const* cp_data;                   // full code-point array
   int32_t const* d_cp_offsets{};             // offsets to each string's code-point array
-  int32_t const* d_offsets{};                // offsets for the output strings
+  int32_t* d_offsets{};                      // offsets for the output strings
   char* d_chars{};                           // buffer for the output strings column
 
   /**
@@ -105,7 +112,7 @@ struct codepoint_to_utf8_fn {
    * @param count number of code-points in `str_cps`
    * @return Number of bytes required for the output
    */
-  __device__ cudf::size_type compute_output_size(uint32_t const* str_cps, uint32_t count)
+  __device__ int32_t compute_output_size(uint32_t const* str_cps, uint32_t count)
   {
     return thrust::transform_reduce(
       thrust::seq,
@@ -113,17 +120,23 @@ struct codepoint_to_utf8_fn {
       str_cps + count,
       [](auto cp) { return 1 + (cp >= UTF8_1BYTE) + (cp >= UTF8_2BYTE) + (cp >= UTF8_3BYTE); },
       0,
-      thrust::plus<cudf::size_type>());
+      thrust::plus<int32_t>());
   }
 
-  __device__ cudf::size_type operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx)
   {
-    if (d_strings.is_null(idx)) return 0;
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     auto const d_str  = d_strings.element<cudf::string_view>(idx);
     auto const offset = d_cp_offsets[idx];
     auto const count  = d_cp_offsets[idx + 1] - offset;  // number of code-points
     auto str_cps      = cp_data + offset;                // code-points for this string
-    if (!d_chars) return compute_output_size(str_cps, count);
+    if (!d_chars) {
+      d_offsets[idx] = compute_output_size(str_cps, count);
+      return;
+    }
     // convert each code-point to 1-4 UTF-8 encoded bytes
     char* out_ptr = d_chars + d_offsets[idx];
     for (uint32_t jdx = 0; jdx < count; ++jdx) {
@@ -149,7 +162,6 @@ struct codepoint_to_utf8_fn {
         *out_ptr++ = static_cast<char>((code_point & 0x3F) | 0x0080);
       }
     }
-    return 0;
   }
 };
 
@@ -161,40 +173,20 @@ std::unique_ptr<cudf::column> normalize_spaces(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  cudf::size_type strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
 
   // create device column
-  auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-  // copy bitmask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-
-  // create offsets by calculating size of each string for output
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, normalize_spaces_fn{d_strings});  // this does size-only calc
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build the chars column
-  cudf::size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
-  auto chars_column     = cudf::strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
+  auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
-  // copy tokens to the chars buffer
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     strings_count,
-                     normalize_spaces_fn{d_strings, d_offsets, d_chars});
-  chars_column->set_null_count(0);  // reset null count for child column
+  // build offsets and children using the normalize_spaces_fn
+  auto children = cudf::strings::detail::make_strings_children(
+    normalize_spaces_fn{*d_strings}, strings.size(), strings.null_count(), stream, mr);
 
-  return cudf::make_strings_column(strings_count,
-                                   std::move(offsets_column),
-                                   std::move(chars_column),
+  return cudf::make_strings_column(strings.size(),
+                                   std::move(children.first),
+                                   std::move(children.second),
                                    strings.null_count(),
-                                   std::move(null_mask),
+                                   cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    stream,
                                    mr);
 }
@@ -207,8 +199,7 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
                                                    rmm::cuda_stream_view stream,
                                                    rmm::mr::device_memory_resource* mr)
 {
-  auto const strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
 
   // create the normalizer and call it
   data_normalizer normalizer(stream, do_lower_case);
@@ -229,33 +220,20 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
   // - the cp_offsets identify which code-points go with which strings
   uint32_t const* cp_chars  = result.first->data();
   int32_t const* cp_offsets = reinterpret_cast<int32_t const*>(result.second->data());
-  auto strings_column       = cudf::column_device_view::create(strings.parent(), stream);
-
-  // build the output offsets column: compute the output size of each string
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
 
-  // create the output chars column
-  cudf::size_type output_bytes =
-    cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
-  auto chars_column = cudf::strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), output_bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data<char>();
+  auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
-  // build the chars output data: convert the 4-byte code-point values into UTF-8 chars
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    strings_count,
-    codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars});
-  chars_column->set_null_count(0);  // reset null count for child column
+  // build offsets and children using the codepoint_to_utf8_fn
+  auto children = cudf::strings::detail::make_strings_children(
+    codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets},
+    strings.size(),
+    strings.null_count(),
+    stream,
+    mr);
 
-  return cudf::make_strings_column(strings_count,
-                                   std::move(offsets_column),
-                                   std::move(chars_column),
+  return cudf::make_strings_column(strings.size(),
+                                   std::move(children.first),
+                                   std::move(children.second),
                                    strings.null_count(),
                                    cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    stream,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e95aab16098..ab14c2577bb 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -54,6 +54,7 @@ ConfigureTest(ERROR_TEST error/error_handling_test.cu)
 ###################################################################################################
 # - groupby tests ---------------------------------------------------------------------------------
 ConfigureTest(GROUPBY_TEST
+    groupby/collect_set_test.cpp
     groupby/groupby_groups_test.cpp
     groupby/group_argmin_test.cpp
     groupby/group_argmax_test.cpp
@@ -70,7 +71,11 @@ ConfigureTest(GROUPBY_TEST
     groupby/group_quantile_test.cpp
     groupby/group_nunique_test.cpp
     groupby/group_nth_element_test.cpp
-    groupby/group_collect_test.cpp)
+    groupby/group_collect_test.cpp
+    groupby/group_sum_scan_test.cpp
+    groupby/group_min_scan_test.cpp
+    groupby/group_max_scan_test.cpp
+    groupby/group_count_scan_test.cpp)
 
 ###################################################################################################
 # - join tests ------------------------------------------------------------------------------------
@@ -394,6 +399,11 @@ ConfigureTest(LISTS_TEST
     lists/extract_tests.cpp
     lists/sort_lists_tests.cpp)
 
+###################################################################################################
+# - bin tests ----------------------------------------------------------------------------------
+ConfigureTest(LABEL_BINS_TEST
+    labeling/label_bins_tests.cpp)
+
 ###################################################################################################
 ### enable testing ################################################################################
 ###################################################################################################
diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
index debf540ea8e..da9509e94a6 100644
--- a/cpp/tests/error/error_handling_test.cu
+++ b/cpp/tests/error/error_handling_test.cu
@@ -83,11 +83,13 @@ TEST(StreamCheck, CatchFailedKernel)
                             "invalid configuration argument");
 }
 
-__global__ void assert_false_kernel() { release_assert(false && "this kernel should die"); }
+#ifndef NDEBUG
+
+__global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); }
 
-__global__ void assert_true_kernel() { release_assert(true && "this kernel should live"); }
+__global__ void assert_true_kernel() { cudf_assert(true && "this kernel should live"); }
 
-TEST(ReleaseAssertDeathTest, release_assert_false)
+TEST(DebugAssertDeathTest, cudf_assert_false)
 {
   testing::FLAGS_gtest_death_test_style = "threadsafe";
 
@@ -100,19 +102,21 @@ TEST(ReleaseAssertDeathTest, release_assert_false)
     // each attempted kernel launch
     if (cudaErrorAssert == cudaDeviceSynchronize()) { std::abort(); }
 
-    // If we reach this point, the release_assert didn't work so we exit normally, which will cause
+    // If we reach this point, the cudf_assert didn't work so we exit normally, which will cause
     // EXPECT_DEATH to fail.
   };
 
   EXPECT_DEATH(call_kernel(), "this kernel should die");
 }
 
-TEST(ReleaseAssert, release_assert_true)
+TEST(DebugAssert, cudf_assert_true)
 {
   assert_true_kernel<<<1, 1>>>();
   ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
 }
 
+#endif
+
 // These tests don't use CUDF_TEST_PROGRAM_MAIN because :
 // 1.) They don't need the RMM Pool
 // 2.) The RMM Pool interferes with the death test
diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp
new file mode 100644
index 00000000000..5303b8f4f61
--- /dev/null
+++ b/cpp/tests/groupby/collect_set_test.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+
+namespace cudf {
+namespace test {
+
+#define COL_K cudf::test::fixed_width_column_wrapper<int32_t, int32_t>
+#define COL_V cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>
+#define COL_S cudf::test::strings_column_wrapper
+#define LCL_V cudf::test::lists_column_wrapper<TypeParam, int32_t>
+#define LCL_S cudf::test::lists_column_wrapper<cudf::string_view>
+#define VALIDITY std::initializer_list<bool>
+#define COLLECT_SET cudf::make_collect_set_aggregation()
+#define COLLECT_SET_NULL_UNEQUAL \
+  cudf::make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL)
+
+struct CollectSetTest : public cudf::test::BaseFixture {
+};
+
+template <typename V>
+struct CollectSetTypedTest : public cudf::test::BaseFixture {
+};
+
+using FixedWidthTypesNotBool = cudf::test::Concat<cudf::test::IntegralTypesNotBool,
+                                                  cudf::test::FloatingPointTypes,
+                                                  cudf::test::TimestampTypes>;
+TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool);
+
+TYPED_TEST(CollectSetTypedTest, ExceptionTests)
+{
+  std::vector<groupby::aggregation_request> agg_requests(1);
+  agg_requests[0].values = COL_V{{1, 2, 3, 4, 5, 6}, {true, false, true, false, true, false}};
+  agg_requests[0].aggregations.push_back(cudf::make_collect_list_aggregation(null_policy::EXCLUDE));
+
+  // groupby cannot exclude nulls
+  groupby::groupby gby{table_view{{COL_K{1, 1, 2, 2, 3, 3}}}};
+  EXPECT_THROW(gby.aggregate(agg_requests), cudf::logic_error);
+}
+
+TYPED_TEST(CollectSetTypedTest, TrivialInput)
+{
+  // Empty input
+  // TODO: Enable this test after issue#7611 has been fixed
+  // test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET);
+
+  // Single key input
+  {
+    COL_K keys{1};
+    COL_V vals{10};
+    COL_K keys_expected{1};
+    LCL_V vals_expected{LCL_V{10}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+  }
+
+  // Non-repeated keys
+  {
+    COL_K keys{2, 1};
+    COL_V vals{20, 10};
+    COL_K keys_expected{1, 2};
+    LCL_V vals_expected{LCL_V{10}, LCL_V{20}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+  }
+}
+
+TYPED_TEST(CollectSetTypedTest, TypicalInput)
+{
+  // Pre-sorted keys
+  {
+    COL_K keys{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
+    COL_V vals{10, 11, 10, 10, 20, 21, 21, 20, 30, 33, 32, 31};
+    COL_K keys_expected{1, 2, 3};
+    LCL_V vals_expected{{10, 11}, {20, 21}, {30, 31, 32, 33}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+  }
+
+  // Expect the result keys to be sorted by sort-based groupby
+  {
+    COL_K keys{4, 1, 2, 4, 3, 3, 2, 1};
+    COL_V vals{40, 10, 20, 40, 30, 30, 20, 11};
+    COL_K keys_expected{1, 2, 3, 4};
+    LCL_V vals_expected{{10, 11}, {20}, {30}, {40}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+  }
+}
+
+// Keys and values columns are sliced columns
+TYPED_TEST(CollectSetTypedTest, SlicedColumnsInput)
+{
+  COL_K keys_original{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
+  COL_V vals_original{10, 11, 10, 10, 20, 21, 21, 20, 30, 33, 32, 31};
+  {
+    auto const keys          = cudf::slice(keys_original, {0, 4})[0];  // { 1, 1, 1, 1 }
+    auto const vals          = cudf::slice(vals_original, {0, 4})[0];  // { 10, 11, 10, 10 }
+    auto const keys_expected = COL_K{1};
+    auto const vals_expected = LCL_V{{10, 11}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+  }
+  {
+    auto const keys = cudf::slice(keys_original, {2, 10})[0];  // { 1, 1, 2, 2, 2, 2, 3, 3 }
+    auto const vals = cudf::slice(vals_original, {2, 10})[0];  // { 10, 10, 20, 21, 21, 20, 30, 33 }
+    auto const keys_expected = COL_K{1, 2, 3};
+    auto const vals_expected = LCL_V{{10}, {20, 21}, {30, 33}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+  }
+}
+
+TEST_F(CollectSetTest, StringInput)
+{
+  COL_K keys{1, 2, 3, 3, 2, 1, 2, 1, 2, 1, 1, 1, 1};
+  COL_S vals{
+    "String 1, first",
+    "String 2, first",
+    "String 3, first",
+    "String 3, second",
+    "String 2, second",
+    "String 1, second",
+    "String 2, second",  // repeated
+    "String 1, second",  // repeated
+    "String 2, second",  // repeated
+    "String 1, second",  // repeated
+    "String 1, second",  // repeated
+    "String 1, second",  // repeated
+    "String 1, second"   // repeated
+  };
+  COL_K keys_expected{1, 2, 3};
+  LCL_S vals_expected{{"String 1, first", "String 1, second"},
+                      {"String 2, first", "String 2, second"},
+                      {"String 3, first", "String 3, second"}};
+  test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+}
+
+TYPED_TEST(CollectSetTypedTest, CollectWithNulls)
+{
+  // Just use an arbitrary value to store null entries
+  // Using this alias variable will make the code look cleaner
+  constexpr int32_t null = 0;
+
+  // Pre-sorted keys
+  {
+    COL_K keys{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
+    COL_V vals{{10, 10, null, null, 20, null, null, null, 30, 31, 30, 31},
+               {true, true, false, false, true, false, false, false, true, true, true, true}};
+    COL_K keys_expected{1, 2, 3};
+
+    // By default, nulls are considered equal, thus only one null is kept per key
+    LCL_V vals_expected{{{10, null}, VALIDITY{true, false}},
+                        {{20, null}, VALIDITY{true, false}},
+                        {{30, 31}, VALIDITY{true, true}}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+
+    // All nulls per key are kept (nulls are put at the end of each list)
+    vals_expected = LCL_V{{{10, null, null}, VALIDITY{true, false, false}},
+                          {{20, null, null, null}, VALIDITY{true, false, false, false}},
+                          {{30, 31}, VALIDITY{true, true}}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET_NULL_UNEQUAL);
+  }
+
+  // Expect the result keys to be sorted by sort-based groupby
+  {
+    COL_K keys{4, 1, 2, 4, 3, 3, 3, 3, 2, 1};
+    COL_V vals{{40, 10, 20, 40, null, null, null, null, 21, null},
+               {true, true, true, true, false, false, false, false, true, false}};
+    COL_K keys_expected{1, 2, 3, 4};
+
+    // By default, nulls are considered equal, thus only one null is kept per key
+    LCL_V vals_expected{{{10, null}, VALIDITY{true, false}},
+                        {{20, 21}, VALIDITY{true, true}},
+                        {{null}, VALIDITY{false}},
+                        {{40}, VALIDITY{true}}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET);
+
+    // All nulls per key are kept (nulls are put at the end of each list)
+    vals_expected = LCL_V{{{10, null}, VALIDITY{true, false}},
+                          {{20, 21}, VALIDITY{true, true}},
+                          {{null, null, null, null}, VALIDITY{false, false, false, false}},
+                          {{40}, VALIDITY{true}}};
+    test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET_NULL_UNEQUAL);
+  }
+}
+
+}  // namespace test
+}  // namespace cudf
diff --git a/cpp/tests/groupby/group_collect_test.cpp b/cpp/tests/groupby/group_collect_test.cpp
index 9edd0a6932a..8a578ea0c0f 100644
--- a/cpp/tests/groupby/group_collect_test.cpp
+++ b/cpp/tests/groupby/group_collect_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,15 +26,15 @@ namespace cudf {
 namespace test {
 
 template <typename V>
-struct groupby_collect_test : public cudf::test::BaseFixture {
+struct groupby_collect_list_test : public cudf::test::BaseFixture {
 };
 
 using FixedWidthTypesNotBool = cudf::test::Concat<cudf::test::IntegralTypesNotBool,
                                                   cudf::test::FloatingPointTypes,
                                                   cudf::test::TimestampTypes>;
-TYPED_TEST_CASE(groupby_collect_test, FixedWidthTypesNotBool);
+TYPED_TEST_CASE(groupby_collect_list_test, FixedWidthTypesNotBool);
 
-TYPED_TEST(groupby_collect_test, CollectWithoutNulls)
+TYPED_TEST(groupby_collect_list_test, CollectWithoutNulls)
 {
   using K = int32_t;
   using V = TypeParam;
@@ -45,11 +45,11 @@ TYPED_TEST(groupby_collect_test, CollectWithoutNulls)
   fixed_width_column_wrapper<K, int32_t> expect_keys{1, 2};
   lists_column_wrapper<V, int32_t> expect_vals{{1, 2, 3}, {4, 5, 6}};
 
-  auto agg = cudf::make_collect_aggregation();
+  auto agg = cudf::make_collect_list_aggregation();
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
-TYPED_TEST(groupby_collect_test, CollectWithNulls)
+TYPED_TEST(groupby_collect_list_test, CollectWithNulls)
 {
   using K = int32_t;
   using V = TypeParam;
@@ -64,11 +64,11 @@ TYPED_TEST(groupby_collect_test, CollectWithNulls)
   lists_column_wrapper<V, int32_t> expect_vals{
     {{1, 2}, validity.begin()}, {{3, 4}, validity.begin()}, {{5, 6}, validity.begin()}};
 
-  auto agg = cudf::make_collect_aggregation();
+  auto agg = cudf::make_collect_list_aggregation();
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
-TYPED_TEST(groupby_collect_test, CollectLists)
+TYPED_TEST(groupby_collect_list_test, CollectLists)
 {
   using K = int32_t;
   using V = TypeParam;
@@ -83,11 +83,11 @@ TYPED_TEST(groupby_collect_test, CollectLists)
   lists_column_wrapper<V, int32_t> expect_vals{
     {{1, 2}, {3, 4}}, {{5, 6, 7}, LCW{}}, {{9, 10}, {11}}};
 
-  auto agg = cudf::make_collect_aggregation();
+  auto agg = cudf::make_collect_list_aggregation();
   test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg));
 }
 
-TYPED_TEST(groupby_collect_test, dictionary)
+TYPED_TEST(groupby_collect_list_test, dictionary)
 {
   using K = int32_t;
   using V = TypeParam;
@@ -105,10 +105,11 @@ TYPED_TEST(groupby_collect_test, dictionary)
                                              0,
                                              rmm::device_buffer{0});
 
-  test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_aggregation());
+  test_single_agg(
+    keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_list_aggregation());
 }
 
-TYPED_TEST(groupby_collect_test, CollectFailsWithNullExclusion)
+TYPED_TEST(groupby_collect_list_test, CollectFailsWithNullExclusion)
 {
   using K = int32_t;
   using V = TypeParam;
@@ -121,10 +122,10 @@ TYPED_TEST(groupby_collect_test, CollectFailsWithNullExclusion)
 
   std::vector<groupby::aggregation_request> agg_requests(1);
   agg_requests[0].values = values;
-  agg_requests[0].aggregations.push_back(cudf::make_collect_aggregation(null_policy::EXCLUDE));
+  agg_requests[0].aggregations.push_back(cudf::make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_EXPECT_THROW_MESSAGE(gby.aggregate(agg_requests),
-                            "null exclusion is not supported on groupby COLLECT aggregation.");
+                            "null exclusion is not supported on groupby COLLECT_LIST aggregation.");
 }
 
 }  // namespace test
diff --git a/cpp/tests/groupby/group_count_scan_test.cpp b/cpp/tests/groupby/group_count_scan_test.cpp
new file mode 100644
index 00000000000..b7b18982f51
--- /dev/null
+++ b/cpp/tests/groupby/group_count_scan_test.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+
+namespace cudf {
+namespace test {
+using K           = int32_t;  // element type of the key columns built through key_wrapper
+using key_wrapper = fixed_width_column_wrapper<K>;
+
+template <typename T>
+struct groupby_count_scan_test : public cudf::test::BaseFixture {  // typed fixture; T is the value column type
+  using V              = T;
+  using R              = cudf::detail::target_type_t<V, aggregation::COUNT_ALL>;  // result type of COUNT_ALL over V
+  using value_wrapper  = fixed_width_column_wrapper<V, int32_t>;
+  using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
+};
+
+TYPED_TEST_CASE(groupby_count_scan_test, cudf::test::AllTypes);  // run the typed tests once per type in AllTypes
+
+TYPED_TEST(groupby_count_scan_test, basic)  // COUNT scan over three interleaved key groups, no nulls
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  value_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+  key_wrapper expect_keys   {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};  // rows grouped by key, keys ascending
+  result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 2};  // zero-based position of each row inside its group
+  // clang-format on
+
+  auto agg1 = cudf::make_count_aggregation();  // default null policy: the scan is expected to reject it
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)),
+                            "Unsupported groupby scan aggregation");
+
+  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);  // the variant expected to succeed
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
+}
+
+TYPED_TEST(groupby_count_scan_test, empty_cols)  // COUNT scan on empty key/value columns
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys;
+  value_wrapper vals;
+
+  key_wrapper expect_keys;
+  result_wrapper expect_vals;
+  // clang-format on
+
+  auto agg1 = cudf::make_count_aggregation();  // default null policy; unlike `basic`, no throw is expected here
+  EXPECT_NO_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)));
+
+  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
+}
+
+TYPED_TEST(groupby_count_scan_test, zero_valid_keys)  // no valid key rows: expected output is empty
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys( {1, 2, 3}, all_null());  // presumably all_null() marks every row null - confirm in test utils
+  value_wrapper vals{3, 4, 5};
+
+  key_wrapper expect_keys{};
+  result_wrapper expect_vals{};
+  // clang-format on
+
+  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);  // only the INCLUDE variant is exercised here
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
+}
+
+TYPED_TEST(groupby_count_scan_test, zero_valid_values)  // single key group whose values use all_null() validity
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys   {1, 1, 1};
+  value_wrapper vals({3, 4, 5}, all_null());
+
+  key_wrapper expect_keys{1, 1, 1};
+  result_wrapper expect_vals{0, 1, 2};  // the expected count still advances on every row under INCLUDE
+  // clang-format on
+
+  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
+}
+
+TYPED_TEST(groupby_count_scan_test, null_keys_and_values)  // mixed validity in both keys and values
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
+
+  // keys in grouped order; `_` marks the row whose key validity is 0 (absent from the expected output)
+  //                        {1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4}
+  key_wrapper expect_keys(  {1, 1, 1, 2, 2, 2, 2, 3,    3, 4}, all_valid());
+  //                        {0, 3, 6, 1, 4, _, 9, 2, 7, 8, -}
+  result_wrapper expect_vals{0, 1, 2, 0, 1,    2, 3, 0, 1, 0};
+  // clang-format on
+
+  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
+}
+
+struct groupby_count_scan_string_test : public cudf::test::BaseFixture {  // non-typed fixture for the string-value case
+};
+
+TEST_F(groupby_count_scan_string_test, basic)  // COUNT scan with a strings value column
+{
+  using V              = cudf::string_view;
+  using R              = cudf::detail::target_type_t<V, aggregation::COUNT_ALL>;
+  using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
+
+  // clang-format off
+  key_wrapper keys           {  1,   3,   3,   5,   5,   0};
+  strings_column_wrapper vals{"1", "1", "1", "1", "1", "1"};
+
+  key_wrapper expect_keys   {0, 1, 3, 3, 5, 5};  // keys ascending
+  result_wrapper expect_vals{0, 0, 0, 1, 0, 1};  // zero-based position of each row inside its group
+  // clang-format on
+
+  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
+}
+
+template <typename T>
+struct FixedPointTestBothReps : public cudf::test::BaseFixture {  // typed fixture for the fixed-point value tests
+};
+
+TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);  // run once per type in FixedPointTypes
+
+TYPED_TEST(FixedPointTestBothReps, GroupByCountScan)  // COUNT scan with fixed-point values at scale -1
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = fixed_point_column_wrapper<RepType>;
+
+  using V              = decimalXX;
+  using R              = cudf::detail::target_type_t<V, aggregation::COUNT_ALL>;
+  using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
+
+  auto const scale = scale_type{-1};
+  // clang-format off
+  auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale};
+
+  auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  auto const expect_vals = result_wrapper{0, 1, 2, 0, 1, 2, 3, 0, 1, 2};  // zero-based position inside each group
+  // clang-format on
+
+  // The default-policy count aggregation is expected to be rejected by the scan.
+  CUDF_EXPECT_THROW_MESSAGE(
+    test_single_scan(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()),
+    "Unsupported groupby scan aggregation");
+
+  auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE);  // the variant expected to succeed
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2));
+}
+
+struct groupby_dictionary_count_scan_test : public cudf::test::BaseFixture {  // non-typed fixture for dictionary values
+};
+
+TEST_F(groupby_dictionary_count_scan_test, basic)  // COUNT scan with string keys and dictionary values
+{
+  using V              = std::string;  // only used to derive the result type R below
+  using R              = cudf::detail::target_type_t<V, aggregation::COUNT_ALL>;
+  using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
+
+  // clang-format off
+  strings_column_wrapper keys{"1", "3", "3", "5", "5", "0"};
+  dictionary_column_wrapper<K> vals{1, 1, 1, 1, 1, 1};
+  strings_column_wrapper expect_keys{"0", "1", "3", "3", "5", "5"};
+  result_wrapper expect_vals{0, 0, 0, 1, 0, 1};  // zero-based position of each row inside its group
+  // clang-format on
+
+  auto agg1 = cudf::make_count_aggregation();  // default null policy: expected to be rejected by the scan
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)),
+                            "Unsupported groupby scan aggregation");
+  test_single_scan(
+    keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(null_policy::INCLUDE));
+}
+
+}  // namespace test
+}  // namespace cudf
diff --git a/cpp/tests/groupby/group_max_scan_test.cpp b/cpp/tests/groupby/group_max_scan_test.cpp
new file mode 100644
index 00000000000..c1fc48ca698
--- /dev/null
+++ b/cpp/tests/groupby/group_max_scan_test.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/dictionary/update_keys.hpp>
+
+namespace cudf {
+namespace test {
+using K           = int32_t;  // element type of the key columns built through key_wrapper
+using key_wrapper = fixed_width_column_wrapper<K>;
+
+template <typename T>
+struct groupby_max_scan_test : public cudf::test::BaseFixture {  // typed fixture; T is the value column type
+  using V              = T;
+  using R              = cudf::detail::target_type_t<V, aggregation::MAX>;  // result type of MAX over V
+  using value_wrapper  = fixed_width_column_wrapper<V, int32_t>;
+  using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
+};
+
+TYPED_TEST_CASE(groupby_max_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint);  // fixed-point has its own fixture below
+
+TYPED_TEST(groupby_max_scan_test, basic)  // running MAX inside each of three interleaved key groups, no nulls
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys   {1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4});
+
+  key_wrapper expect_keys    {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+                          // {5, 8, 1, 6, 9, 0, 4, 7, 2, 3}  <- input values rearranged into grouped key order
+  result_wrapper expect_vals({5, 8, 8, 6, 9, 9, 9, 7, 7, 7});
+  // clang-format on
+
+  auto agg = cudf::make_max_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_max_scan_test, empty_cols)  // MAX scan on empty inputs: expected outputs are empty too
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  key_wrapper keys{};
+  value_wrapper vals{};
+
+  key_wrapper expect_keys{};
+  result_wrapper expect_vals{};
+
+  auto agg = cudf::make_max_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_max_scan_test, zero_valid_keys)  // no valid key rows: expected output is empty
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys(  {1, 2, 3}, all_null());  // presumably all_null() marks every row null - confirm in test utils
+  value_wrapper vals({3, 4, 5});
+
+  key_wrapper expect_keys{};
+  result_wrapper expect_vals{};
+  // clang-format on
+
+  auto agg = cudf::make_max_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_max_scan_test, zero_valid_values)  // single key group whose values use all_null() validity
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys   {1, 1, 1};
+  value_wrapper vals({3, 4, 5}, all_null());
+
+  key_wrapper expect_keys    {1, 1, 1};
+  result_wrapper expect_vals({-1, -1, -1}, all_null());  // expected result also uses all_null(); -1 is filler data
+  // clang-format on
+
+  auto agg = cudf::make_max_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_max_scan_test, null_keys_and_values)  // mixed validity in both keys and values
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
+
+                         //  {1, 1, 1, 2, 2, 2, 2, 3,   _, 3, 4}
+  key_wrapper expect_keys(   {1, 1, 1, 2, 2, 2, 2, 3,      3, 4}, all_valid());
+                         //  { -, 8, 1, 6, 9,  -, 4, 7, _, 3, -}  (values in grouped order; - null value, _ null-key row)
+  result_wrapper expect_vals({-1, 8, 8, 6, 9, -1, 9, 7,    7, -1},
+                             { 0, 1, 1, 1, 1,  0, 1, 1,    1, 0});
+  // clang-format on
+
+  auto agg = cudf::make_max_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+template <typename T>
+struct FixedPointTestBothReps : public cudf::test::BaseFixture {  // typed fixture for the fixed-point value tests
+};
+
+TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);  // run once per type in FixedPointTypes
+
+TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue)  // MAX scan with fixed-point values
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {2, 1, 0, -1, -2}) {  // same data repeated at positive, zero and negative scales
+    auto const scale = scale_type{i};
+    // clang-format off
+    auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+    auto const vals = fp_wrapper{{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, scale};
+
+    // input values rearranged into grouped key order:       {5, 8, 1, 6, 9, 0, 4, 7, 2, 3}
+    auto const expect_keys     = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+    auto const expect_vals_max = fp_wrapper{{5, 8, 8, 6, 9, 9, 9, 7, 7, 7}, scale};  // input scale is kept
+    // clang-format on
+
+    auto agg = cudf::make_max_aggregation();
+    test_single_scan(keys, vals, expect_keys, expect_vals_max, std::move(agg));
+  }
+}
+
+}  // namespace test
+}  // namespace cudf
diff --git a/cpp/tests/groupby/group_min_scan_test.cpp b/cpp/tests/groupby/group_min_scan_test.cpp
new file mode 100644
index 00000000000..d3186d880cc
--- /dev/null
+++ b/cpp/tests/groupby/group_min_scan_test.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+
+namespace cudf {
+namespace test {
+using K           = int32_t;  // element type of the key columns built through key_wrapper
+using key_wrapper = fixed_width_column_wrapper<K>;
+
+template <typename T>
+struct groupby_min_scan_test : public cudf::test::BaseFixture {  // typed fixture; T is the value column type
+  using V              = T;
+  using R              = cudf::detail::target_type_t<V, aggregation::MIN>;  // result type of MIN over V
+  using value_wrapper  = fixed_width_column_wrapper<V, int32_t>;
+  using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
+};
+
+TYPED_TEST_CASE(groupby_min_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint);  // fixed-point has its own fixture below
+
+TYPED_TEST(groupby_min_scan_test, basic)  // running MIN inside each of three interleaved key groups, no nulls
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys   {1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4});
+
+  key_wrapper expect_keys    {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  result_wrapper expect_vals({5, 5, 1, 6, 6, 0, 0, 7, 2, 2});  // running min of {5, 8, 1 | 6, 9, 0, 4 | 7, 2, 3}
+  // clang-format on
+
+  auto agg = cudf::make_min_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_min_scan_test, empty_cols)  // MIN scan on empty inputs: expected outputs are empty too
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  key_wrapper keys{};
+  value_wrapper vals{};
+
+  key_wrapper expect_keys{};
+  result_wrapper expect_vals{};
+
+  auto agg = cudf::make_min_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_min_scan_test, zero_valid_keys)  // no valid key rows: expected output is empty
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys({1, 2, 3}, all_null());  // presumably all_null() marks every row null - confirm in test utils
+  value_wrapper vals({3, 4, 5});
+
+  key_wrapper expect_keys{};
+  result_wrapper expect_vals{};
+  // clang-format on
+
+  auto agg = cudf::make_min_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_min_scan_test, zero_valid_values)  // single key group whose values use all_null() validity
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys   {1, 1, 1};
+  value_wrapper vals({3, 4, 5}, all_null());
+
+  key_wrapper expect_keys    {1, 1, 1};
+  result_wrapper expect_vals({-1, -1, -1}, all_null());  // expected result also uses all_null(); -1 is filler data
+  // clang-format on
+
+  auto agg = cudf::make_min_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_min_scan_test, null_keys_and_values)  // mixed validity in both keys and values
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
+
+                         //  { 1, 1, 1, 2, 2,  2, 2, 3, _, 3, 4}
+  key_wrapper expect_keys(   { 1, 1, 1, 2, 2,  2, 2, 3,    3, 4}, all_valid());
+                         //  { _, 8, 1, 6, 9,  _, 4, 7, 2, 3, _}  (_ = null value; the 2 belongs to the null-key row)
+  result_wrapper expect_vals({-1, 8, 1, 6, 6, -1, 4, 7,    3, -1},
+                             { 0, 1, 1, 1, 1,  0, 1, 1,    1, 0});
+  // clang-format on
+
+  auto agg = cudf::make_min_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+struct groupby_min_scan_string_test : public cudf::test::BaseFixture {  // non-typed fixture for the string-value case
+};
+
+TEST_F(groupby_min_scan_string_test, basic)  // MIN scan over string values is expected to throw
+{
+  key_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  strings_column_wrapper vals{"año", "bit", "₹1", "aaa", "zit", "bat", "aaa", "$1", "₹1", "wut"};
+
+  key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  strings_column_wrapper expect_vals;  // left empty: the call below is expected to throw
+
+  auto agg = cudf::make_min_aggregation();
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)),
+                            "Unsupported groupby scan type-agg combination");
+}
+
+template <typename T>
+struct FixedPointTestBothReps : public cudf::test::BaseFixture {  // typed fixture for the fixed-point value tests
+};
+
+TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);  // run once per type in FixedPointTypes
+
+TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue)  // MIN scan with fixed-point values
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = fixed_point_column_wrapper<RepType>;
+
+  for (auto const i : {2, 1, 0, -1, -2}) {  // same data repeated at positive, zero and negative scales
+    auto const scale = scale_type{i};
+
+    // clang-format off
+    auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+    auto const vals = fp_wrapper{{5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, scale};
+
+    // input values rearranged into grouped key order: {5, 8, 1, 6, 9, 0, 4, 7, 2, 3}
+    auto const expect_keys     = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+    auto const expect_vals_min = fp_wrapper{{5, 5, 1, 6, 6, 0, 0, 7, 2, 2}, scale};  // input scale is kept
+    // clang-format on
+
+    auto agg = cudf::make_min_aggregation();
+    test_single_scan(keys, vals, expect_keys, expect_vals_min, std::move(agg));
+  }
+}
+
+}  // namespace test
+}  // namespace cudf
diff --git a/cpp/tests/groupby/group_sum_scan_test.cpp b/cpp/tests/groupby/group_sum_scan_test.cpp
new file mode 100644
index 00000000000..9f6c21462b3
--- /dev/null
+++ b/cpp/tests/groupby/group_sum_scan_test.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+
+namespace cudf {
+namespace test {
+using K           = int32_t;  // element type of the key columns built through key_wrapper
+using key_wrapper = fixed_width_column_wrapper<K>;
+
+template <typename T>
+struct groupby_sum_scan_test : public cudf::test::BaseFixture {  // typed fixture; T is the value column type
+  using V              = T;
+  using R              = cudf::detail::target_type_t<V, aggregation::SUM>;  // result type of SUM over V
+  using value_wrapper  = fixed_width_column_wrapper<V, int32_t>;
+  using result_wrapper = fixed_width_column_wrapper<R, int32_t>;
+};
+
+using supported_types =  // value types the typed SUM scan tests run over: signed ints, floats, durations
+  cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t, float, double>,
+                     cudf::test::DurationTypes>;
+
+TYPED_TEST_CASE(groupby_sum_scan_test, supported_types);
+
+TYPED_TEST(groupby_sum_scan_test, basic)  // running SUM inside each of three interleaved key groups, no nulls
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  value_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+  key_wrapper expect_keys   {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+  // input values in grouped order: {0, 3, 6, 1, 4, 5, 9, 2, 7, 8}
+  result_wrapper expect_vals{0, 3, 9, 1, 5, 10, 19, 2, 9, 17};
+  // clang-format on
+  auto agg = cudf::make_sum_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_sum_scan_test, empty_cols)  // SUM scan on empty inputs: expected outputs are empty too
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys{};
+  value_wrapper vals{};
+
+  key_wrapper expect_keys{};
+  result_wrapper expect_vals{};
+  // clang-format on
+
+  auto agg = cudf::make_sum_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_sum_scan_test, zero_valid_keys)  // no valid key rows: expected output is empty
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys({1, 2, 3}, all_null());  // presumably all_null() marks every row null - confirm in test utils
+  value_wrapper vals{3, 4, 5};
+
+  key_wrapper expect_keys{};
+  result_wrapper expect_vals{};
+  // clang-format on
+
+  auto agg = cudf::make_sum_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_sum_scan_test, zero_valid_values)  // single key group whose values use all_null() validity
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys   {1, 1, 1};
+  value_wrapper vals({3, 4, 5}, all_null());
+
+  key_wrapper expect_keys    {1, 1, 1};
+  result_wrapper expect_vals({3, 4, 5}, all_null());  // expected result also uses all_null() validity
+  // clang-format on
+
+  auto agg = cudf::make_sum_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+TYPED_TEST(groupby_sum_scan_test, null_keys_and_values)  // mixed validity in both keys and values
+{
+  using value_wrapper  = typename TestFixture::value_wrapper;
+  using result_wrapper = typename TestFixture::result_wrapper;
+
+  // clang-format off
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
+
+  //                         { 1, 1, 1, 2, 2,  2,  2, 3, *, 3, 4};  (* = row with a null key)
+  key_wrapper expect_keys(   { 1, 1, 1, 2, 2,  2,  2, 3,    3, 4}, all_valid());
+                          // { -, 3, 6, 1, 4,  -,  9, 2, _, 8, -}  (- = null value, _ = null-key row)
+  result_wrapper expect_vals({-1, 3, 9, 1, 5, -1, 14, 2,   10, -1},
+                             { 0, 1, 1, 1, 1,  0,  1, 1,    1, 0});
+  // clang-format on
+
+  auto agg = cudf::make_sum_aggregation();
+  test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
+template <typename T>
+struct FixedPointTestBothReps : public cudf::test::BaseFixture {  // typed fixture for the fixed-point value tests
+};
+
+TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);  // run once per type in FixedPointTypes
+
+TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue)  // SUM scan with fixed-point values
+{
+  using namespace numeric;
+  using decimalXX      = TypeParam;
+  using RepType        = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper     = fixed_point_column_wrapper<RepType>;
+  using out_fp_wrapper = fixed_point_column_wrapper<int64_t>;  // expected sums use an int64_t rep for every input rep
+
+  for (auto const i : {2, 1, 0, -1, -2}) {  // same data repeated at positive, zero and negative scales
+    auto const scale = scale_type{i};
+    // clang-format off
+    auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+    auto const vals = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale};
+
+    auto const expect_keys     = key_wrapper    {1, 1, 1, 2, 2, 2, 2, 3, 3, 3};
+    auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale};  // input scale is kept
+    // clang-format on
+
+    auto agg2 = cudf::make_sum_aggregation();
+    test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2));
+  }
+}
+
+}  // namespace test
+}  // namespace cudf
diff --git a/cpp/tests/groupby/groupby_keys_test.cpp b/cpp/tests/groupby/groupby_keys_test.cpp
index 06ec9eb8968..78299e1a18c 100644
--- a/cpp/tests/groupby/groupby_keys_test.cpp
+++ b/cpp/tests/groupby/groupby_keys_test.cpp
@@ -33,166 +33,229 @@ using supported_types = cudf::test::
 
 TYPED_TEST_CASE(groupby_keys_test, supported_types);
 
-// clang-format off
 TYPED_TEST(groupby_keys_test, basic)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
 
-    fixed_width_column_wrapper<K> keys        { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
-    fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  // clang-format off
+  fixed_width_column_wrapper<K> keys        { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
 
-    fixed_width_column_wrapper<K> expect_keys { 1, 2, 3 };
-    fixed_width_column_wrapper<R> expect_vals { 3, 4, 3 };
+  fixed_width_column_wrapper<K> expect_keys { 1, 2, 3 };
+  fixed_width_column_wrapper<R> expect_vals { 3, 4, 3 };
+  // clang-format on
 
-    auto agg = cudf::make_count_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+  auto agg = cudf::make_count_aggregation();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
 TYPED_TEST(groupby_keys_test, zero_valid_keys)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
 
-    fixed_width_column_wrapper<K> keys      ( { 1, 2, 3}, all_null() );
-    fixed_width_column_wrapper<V> vals        { 3, 4, 5};
+  // clang-format off
+  fixed_width_column_wrapper<K> keys      ( { 1, 2, 3}, all_null() );
+  fixed_width_column_wrapper<V> vals        { 3, 4, 5};
 
-    fixed_width_column_wrapper<K> expect_keys { };
-    fixed_width_column_wrapper<R> expect_vals { };
+  fixed_width_column_wrapper<K> expect_keys { };
+  fixed_width_column_wrapper<R> expect_vals { };
+  // clang-format on
 
-    auto agg = cudf::make_count_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+  auto agg = cudf::make_count_aggregation();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
 TYPED_TEST(groupby_keys_test, some_null_keys)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
-
-    fixed_width_column_wrapper<K> keys(       { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                              { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
-    fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::COUNT_VALID>;
 
-                                          //  { 1, 1, 1,  2, 2, 2, 2,  3, 3,  4}
-    fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3,     4}, all_valid());
-                                          //  { 0, 3, 6,  1, 4, 5, 9,  2, 8,  -}
-    fixed_width_column_wrapper<R> expect_vals { 3,        4,           2,     1};
+  // clang-format off
+  fixed_width_column_wrapper<K> keys(       { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+                                            { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
+
+                                        //  { 1, 1, 1,  2, 2, 2, 2,  3, 3,  4}
+  fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3,     4}, all_valid());
+                                        //  { 0, 3, 6,  1, 4, 5, 9,  2, 8,  -}
+  fixed_width_column_wrapper<R> expect_vals { 3,        4,           2,     1};
+  // clang-format on
 
-    auto agg = cudf::make_count_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+  auto agg = cudf::make_count_aggregation();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
 TYPED_TEST(groupby_keys_test, include_null_keys)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::SUM>;
-
-    fixed_width_column_wrapper<K> keys(       { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                              { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
-    fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
-
-                                          //  { 1, 1, 1,  2, 2, 2, 2,  3, 3,  4,  -}
-    fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3,     4,  3},
-                                              { 1,        1,           1,     1,  0});
-                                          //  { 0, 3, 6,  1, 4, 5, 9,  2, 8,  -,  -}
-    fixed_width_column_wrapper<R> expect_vals { 9,        19,          10,    4,  7};
-
-    auto agg = cudf::make_sum_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg),
-        force_use_sort_impl::NO, null_policy::INCLUDE);
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::SUM>;
+
+  // clang-format off
+  fixed_width_column_wrapper<K> keys(       { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+                                            { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
+
+                                        //  { 1, 1, 1,  2, 2, 2, 2,  3, 3,  4,  -}
+  fixed_width_column_wrapper<K> expect_keys({ 1,        2,           3,     4,  3},
+                                            { 1,        1,           1,     1,  0});
+                                        //  { 0, 3, 6,  1, 4, 5, 9,  2, 8,  -,  -}
+  fixed_width_column_wrapper<R> expect_vals { 9,        19,          10,    4,  7};
+  // clang-format on
+
+  auto agg = cudf::make_sum_aggregation();
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  std::move(agg),
+                  force_use_sort_impl::NO,
+                  null_policy::INCLUDE);
 }
 
 TYPED_TEST(groupby_keys_test, pre_sorted_keys)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::SUM>;
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::SUM>;
 
-    fixed_width_column_wrapper<K> keys        { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4};
-    fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
+  // clang-format off
+  fixed_width_column_wrapper<K> keys        { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4};
+  fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
 
-    fixed_width_column_wrapper<K> expect_keys { 1,       2,          3,       4};
-    fixed_width_column_wrapper<R> expect_vals { 3,       18,         24,      4};
+  fixed_width_column_wrapper<K> expect_keys { 1,       2,          3,       4};
+  fixed_width_column_wrapper<R> expect_vals { 3,       18,         24,      4};
+  // clang-format on
 
-    auto agg = cudf::make_sum_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), 
-        force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES);
+  auto agg = cudf::make_sum_aggregation();
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  std::move(agg),
+                  force_use_sort_impl::YES,
+                  null_policy::EXCLUDE,
+                  sorted::YES);
 }
 
 TYPED_TEST(groupby_keys_test, pre_sorted_keys_descending)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::SUM>;
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::SUM>;
 
-    fixed_width_column_wrapper<K> keys        { 4, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1};
-    fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
+  // clang-format off
+  fixed_width_column_wrapper<K> keys        { 4, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1};
+  fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
 
-    fixed_width_column_wrapper<K> expect_keys { 4, 3,       2,          1      };
-    fixed_width_column_wrapper<R> expect_vals { 0, 6,       22,        21      };
+  fixed_width_column_wrapper<K> expect_keys { 4, 3,       2,          1      };
+  fixed_width_column_wrapper<R> expect_vals { 0, 6,       22,        21      };
+  // clang-format on
 
-    auto agg = cudf::make_sum_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg),
-        force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES, {order::DESCENDING});
+  auto agg = cudf::make_sum_aggregation();
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  std::move(agg),
+                  force_use_sort_impl::YES,
+                  null_policy::EXCLUDE,
+                  sorted::YES,
+                  {order::DESCENDING});
 }
 
 TYPED_TEST(groupby_keys_test, pre_sorted_keys_nullable)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::SUM>;
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::SUM>;
 
-    fixed_width_column_wrapper<K> keys(       { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4},
-                                              { 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1});
-    fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
+  // clang-format off
+  fixed_width_column_wrapper<K> keys(       { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4},
+                                            { 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1});
+  fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
 
-    fixed_width_column_wrapper<K> expect_keys({ 1,       2,          3,       4}, all_valid());
-    fixed_width_column_wrapper<R> expect_vals { 3,       15,         17,      4};
+  fixed_width_column_wrapper<K> expect_keys({ 1,       2,          3,       4}, all_valid());
+  fixed_width_column_wrapper<R> expect_vals { 3,       15,         17,      4};
+  // clang-format on
 
-    auto agg = cudf::make_sum_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg),
-        force_use_sort_impl::YES, null_policy::EXCLUDE, sorted::YES); 
+  auto agg = cudf::make_sum_aggregation();
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  std::move(agg),
+                  force_use_sort_impl::YES,
+                  null_policy::EXCLUDE,
+                  sorted::YES);
 }
 
 TYPED_TEST(groupby_keys_test, pre_sorted_keys_nulls_before_include_nulls)
 {
-    using K = TypeParam;
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::SUM>;
-
-    fixed_width_column_wrapper<K> keys(       { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4},
-                                              { 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1});
-    fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
-
-                                          //  { 1, 1, 1,  -, -,  2, 2,  -,  3, 3,  4}
-    fixed_width_column_wrapper<K> expect_keys({ 1,        2,     2,     3,  3,     4},
-                                              { 1,        0,     1,     0,  1,     1});
-    fixed_width_column_wrapper<R> expect_vals { 3,        7,     11,    7,  17,    4};
-
-    auto agg = cudf::make_sum_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg),
-        force_use_sort_impl::YES, null_policy::INCLUDE, sorted::YES); 
+  using K = TypeParam;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::SUM>;
+
+  // clang-format off
+  fixed_width_column_wrapper<K> keys(       { 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4},
+                                            { 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1});
+  fixed_width_column_wrapper<V> vals        { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4};
+
+                                        //  { 1, 1, 1,  -, -,  2, 2,  -,  3, 3,  4}
+  fixed_width_column_wrapper<K> expect_keys({ 1,        2,     2,     3,  3,     4},
+                                            { 1,        0,     1,     0,  1,     1});
+  fixed_width_column_wrapper<R> expect_vals { 3,        7,     11,    7,  17,    4};
+  // clang-format on
+
+  auto agg = cudf::make_sum_aggregation();
+  test_single_agg(keys,
+                  vals,
+                  expect_keys,
+                  expect_vals,
+                  std::move(agg),
+                  force_use_sort_impl::YES,
+                  null_policy::INCLUDE,
+                  sorted::YES);
+}
+
+TYPED_TEST(groupby_keys_test, mismatch_num_rows)
+{
+  using K = TypeParam;
+  using V = int32_t;
+  // keys has 3 rows but vals has 5, so both helpers are expected to throw.
+  fixed_width_column_wrapper<K> keys{1, 2, 3};
+  fixed_width_column_wrapper<V> vals{0, 1, 2, 3, 4};
+  // Build a fresh aggregation per call: a unique_ptr that was moved from is left null.
+  auto make_agg = [] { return cudf::make_count_aggregation(); };
+  CUDF_EXPECT_THROW_MESSAGE(test_single_agg(keys, vals, keys, vals, make_agg()),
+                            "Size mismatch between request values and groupby keys.");
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, make_agg()),
+                            "Size mismatch between request values and groupby keys.");
 }
 
-struct groupby_string_keys_test : public cudf::test::BaseFixture {};
+struct groupby_string_keys_test : public cudf::test::BaseFixture {
+};
 
 TEST_F(groupby_string_keys_test, basic)
 {
-    using V = int32_t;
-    using R = cudf::detail::target_type_t<V, aggregation::SUM>;
+  using V = int32_t;
+  using R = cudf::detail::target_type_t<V, aggregation::SUM>;
 
-    strings_column_wrapper        keys        { "aaa", "año", "₹1", "aaa", "año", "año", "aaa", "₹1", "₹1", "año"};
-    fixed_width_column_wrapper<V> vals        {     0,     1,    2,     3,     4,     5,     6,    7,    8,     9};
+  // clang-format off
+  strings_column_wrapper        keys        { "aaa", "año", "₹1", "aaa", "año", "año", "aaa", "₹1", "₹1", "año"};
+  fixed_width_column_wrapper<V> vals        {     0,     1,    2,     3,     4,     5,     6,    7,    8,     9};
 
-    strings_column_wrapper        expect_keys({ "aaa", "año", "₹1" });
-    fixed_width_column_wrapper<R> expect_vals {     9,    19,   17 };
+  strings_column_wrapper        expect_keys({ "aaa", "año", "₹1" });
+  fixed_width_column_wrapper<R> expect_vals {     9,    19,   17 };
+  // clang-format on
 
-    auto agg = cudf::make_sum_aggregation();
-    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+  auto agg = cudf::make_sum_aggregation();
+  test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 // clang-format on
 
diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp
index 0b68b7bbfb4..c7e27cd6367 100644
--- a/cpp/tests/groupby/groupby_test_util.hpp
+++ b/cpp/tests/groupby/groupby_test_util.hpp
@@ -99,6 +99,32 @@ inline void test_single_agg(column_view const& keys,
   }
 }
 
+// Runs `agg` as a single-request groupby scan of `values` grouped by `keys`, then checks the
+// returned keys table and the first result column against `expect_keys` / `expect_vals`.
+inline void test_single_scan(column_view const& keys,
+                             column_view const& values,
+                             column_view const& expect_keys,
+                             column_view const& expect_vals,
+                             std::unique_ptr<aggregation>&& agg,
+                             null_policy include_null_keys                  = null_policy::EXCLUDE,
+                             sorted keys_are_sorted                         = sorted::NO,
+                             std::vector<order> const& column_order         = {},
+                             std::vector<null_order> const& null_precedence = {})
+{
+  std::vector<groupby::aggregation_request> requests(1);  // single request under test
+  requests.front().values = values;
+  requests.front().aggregations.push_back(std::move(agg));
+
+  groupby::groupby grouper(
+    table_view({keys}), include_null_keys, keys_are_sorted, column_order, null_precedence);
+
+  // groupby scan uses sort implementation
+  auto scanned = grouper.scan(requests);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({expect_keys}), scanned.first->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, *scanned.second[0].results[0], true);
+}
+
 inline auto all_valid()
 {
   auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 995ee94472f..013457d8ed6 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -100,6 +100,89 @@ std::unique_ptr<cudf::table> create_compressible_fixed_table(cudf::size_type num
   return create_fixed_table<T>(num_columns, num_rows, include_validity, compressible_elements);
 }
 
+// Builds a LIST<LIST<T>> test column; replicates the "list_gen" function in
+// python/cudf/cudf/tests/test_parquet.py. With include_validity, odd-indexed rows are null.
+template <typename T>
+std::unique_ptr<cudf::column> make_parquet_list_col(
+  int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity)
+{
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0 ? 1 : 0; });
+
+  // root list: each valid row owns lists_per_row child lists; null rows contribute none
+  std::vector<int> row_offsets(num_rows + 1);
+  int row_offset_count = 0;
+  {
+    int offset = 0;
+    for (int idx = 0; idx < (num_rows) + 1; idx++) {
+      row_offsets[row_offset_count] = offset;
+      if (!include_validity || valids[idx]) { offset += lists_per_row; }
+      row_offset_count++;
+    }
+  }
+  cudf::test::fixed_width_column_wrapper<int> offsets(row_offsets.begin(),
+                                                      row_offsets.begin() + row_offset_count);
+
+  // child list: every child list of a valid row holds list_size values
+  std::vector<int> child_row_offsets((num_rows * lists_per_row) + 1);
+  int child_row_offset_count = 0;
+  {
+    int offset = 0;
+    for (int idx = 0; idx < (num_rows * lists_per_row); idx++) {
+      int row_index = idx / lists_per_row;
+      if (include_validity && !valids[row_index]) { continue; }
+
+      child_row_offsets[child_row_offset_count] = offset;
+      offset += list_size;
+      child_row_offset_count++;
+    }
+    child_row_offsets[child_row_offset_count++] = offset;
+  }
+  cudf::test::fixed_width_column_wrapper<int> child_offsets(
+    child_row_offsets.begin(), child_row_offsets.begin() + child_row_offset_count);
+
+  // child values: a running counter; values belonging to null rows are consumed but not stored
+  std::vector<T> child_values(num_rows * lists_per_row * list_size);
+  T first_child_value_index = skip_rows * lists_per_row * list_size;
+  int child_value_count     = 0;
+  {
+    for (int idx = 0; idx < (num_rows * lists_per_row * list_size); idx++) {
+      int row_index = idx / (lists_per_row * list_size);
+
+      int val = first_child_value_index;
+      first_child_value_index++;
+
+      if (include_validity && !valids[row_index]) { continue; }
+
+      child_values[child_value_count] = val;
+      child_value_count++;
+    }
+  }
+  // validity by value instead of index: even positions within each list_size run are valid
+  auto valids2 = cudf::detail::make_counting_transform_iterator(
+    0, [list_size](auto i) { return (i % list_size) % 2 == 0 ? 1 : 0; });
+  auto child_data = include_validity
+                      ? cudf::test::fixed_width_column_wrapper<T>(
+                          child_values.begin(), child_values.begin() + child_value_count, valids2)
+                      : cudf::test::fixed_width_column_wrapper<T>(
+                          child_values.begin(), child_values.begin() + child_value_count);
+
+  int child_offsets_size = static_cast<cudf::column_view>(child_offsets).size() - 1;
+  auto child             = cudf::make_lists_column(
+    child_offsets_size, child_offsets.release(), child_data.release(), 0, rmm::device_buffer{});
+
+  int offsets_size = static_cast<cudf::column_view>(offsets).size() - 1;
+  return include_validity
+           ? cudf::make_lists_column(
+               offsets_size,
+               offsets.release(),
+               std::move(child),
+               cudf::UNKNOWN_NULL_COUNT,
+               cudf::test::detail::make_null_mask(valids, valids + offsets_size))
+           : cudf::make_lists_column(
+               offsets_size, offsets.release(), std::move(child), 0, rmm::device_buffer{});
+}
+
 void compare_metadata_equality(cudf::io::table_input_metadata in_meta,
                                cudf::io::table_metadata out_meta)
 {
@@ -2188,6 +2271,125 @@ TEST_F(ParquetReaderTest, UserBounds)
   }
 }
 
+TEST_F(ParquetReaderTest, UserBoundsWithNulls)
+{
+  // clang-format off
+  cudf::test::fixed_width_column_wrapper<float> col{{1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,  5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7, 8,8,8,8,8,8,8,8}
+                                                   ,{1,1,1,0,0,0,1,1, 1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,0,0,  1,0,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0}};
+  // clang-format on
+  cudf::table_view tbl({col});
+  auto filepath = temp_env->get_temp_filepath("UserBoundsWithNulls.parquet");
+  cudf_io::parquet_writer_options out_args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl);
+  cudf_io::write_parquet(out_args);
+
+  // {skip_rows, num_rows} pairs; a negative entry leaves that reader option unset
+  // clang-format off
+  std::vector<std::pair<int, int>> params{ {-1, -1}, {1, 3}, {3, -1}, 
+                                           {31, -1}, {32, -1}, {33, -1},
+                                           {31, 5}, {32, 5}, {33, 5},
+                                           {-1, 7}, {-1, 31}, {-1, 32}, {-1, 33},
+                                           {62, -1}, {63, -1},
+                                           {62, 2}, {63, 1}};
+  // clang-format on
+  for (auto p : params) {
+    cudf_io::parquet_reader_options read_args =
+      cudf::io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+    if (p.first >= 0) { read_args.set_skip_rows(p.first); }
+    if (p.second >= 0) { read_args.set_num_rows(p.second); }
+    auto result = cudf_io::read_parquet(read_args);
+    // resolve negative placeholders to concrete bounds, then compare against a slice of col
+    p.first  = p.first < 0 ? 0 : p.first;
+    p.second = p.second < 0 ? static_cast<cudf::column_view>(col).size() - p.first : p.second;
+    std::vector<cudf::size_type> slice_indices{p.first, p.first + p.second};
+    auto expected = cudf::slice(col, slice_indices);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]);
+  }
+}
+
+TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge)
+{
+  constexpr int num_rows = 30 * 1000000;
+  // seeded generator: each row is valid with probability 0.7
+  std::mt19937 gen(6747);
+  std::bernoulli_distribution bn(0.7f);
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); });
+  auto values = thrust::make_counting_iterator(0);
+
+  cudf::test::fixed_width_column_wrapper<int> col(values, values + num_rows, valids);
+
+  // this file will have row groups of 1,000,000 each
+  cudf::table_view tbl({col});
+  auto filepath = temp_env->get_temp_filepath("UserBoundsWithNullsLarge.parquet");
+  cudf_io::parquet_writer_options out_args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl);
+  cudf_io::write_parquet(out_args);
+
+  // {skip_rows, num_rows} pairs; a negative entry leaves that reader option unset
+  // clang-format off
+  std::vector<std::pair<int, int>> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1613470, -1}, {1999999, -1},
+                                           {31, 1}, {32, 1}, {33, 1},
+                                           // deliberately span some row group boundaries
+                                           {999000, 1001}, {999000, 2000}, {2999999, 2}, {13999997, -1},
+                                           {16785678, 3}, {22996176, 31},
+                                           {24001231, 17}, {29000001, 989999}, {29999999, 1} };
+  // clang-format on
+  for (auto p : params) {
+    cudf_io::parquet_reader_options read_args =
+      cudf::io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+    if (p.first >= 0) { read_args.set_skip_rows(p.first); }
+    if (p.second >= 0) { read_args.set_num_rows(p.second); }
+    auto result = cudf_io::read_parquet(read_args);
+    // resolve negative placeholders to concrete bounds, then compare against a slice of col
+    p.first  = p.first < 0 ? 0 : p.first;
+    p.second = p.second < 0 ? static_cast<cudf::column_view>(col).size() - p.first : p.second;
+    std::vector<cudf::size_type> slice_indices{p.first, p.first + p.second};
+    auto expected = cudf::slice(col, slice_indices);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]);
+  }
+}
+
+TEST_F(ParquetReaderTest, ListUserBoundsWithNullsLarge)
+{
+  constexpr int num_rows = 5 * 1000000;
+  auto colp              = make_parquet_list_col<int>(0, num_rows, 5, 8, true);
+  cudf::column_view col  = *colp;
+
+  // this file will have row groups of 1,000,000 each
+  cudf::table_view tbl({col});
+  auto filepath = temp_env->get_temp_filepath("ListUserBoundsWithNullsLarge.parquet");
+  cudf_io::parquet_writer_options out_args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl);
+  cudf_io::write_parquet(out_args);
+
+  // {skip_rows, num_rows} pairs; a negative entry leaves that reader option unset
+  // clang-format off
+  std::vector<std::pair<int, int>> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {161470, -1}, {4499997, -1},
+                                           {31, 1}, {32, 1}, {33, 1},
+                                           // deliberately span some row group boundaries
+                                           {999000, 1001}, {999000, 2000}, {2999999, 2},
+                                           {1678567, 3}, {4299676, 31},
+                                           {4001231, 17}, {1900000, 989999}, {4999999, 1} };
+  // clang-format on
+  for (auto p : params) {
+    cudf_io::parquet_reader_options read_args =
+      cudf::io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+    if (p.first >= 0) { read_args.set_skip_rows(p.first); }
+    if (p.second >= 0) { read_args.set_num_rows(p.second); }
+    auto result = cudf_io::read_parquet(read_args);
+    // resolve negative placeholders to concrete bounds, then compare against a slice of col
+    p.first  = p.first < 0 ? 0 : p.first;
+    p.second = p.second < 0 ? static_cast<cudf::column_view>(col).size() - p.first : p.second;
+    std::vector<cudf::size_type> slice_indices{p.first, p.first + p.second};
+    auto expected = cudf::slice(col, slice_indices);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected[0]);
+  }
+}
+
 TEST_F(ParquetReaderTest, ReorderedColumns)
 {
   {
diff --git a/cpp/tests/labeling/label_bins_tests.cpp b/cpp/tests/labeling/label_bins_tests.cpp
new file mode 100644
index 00000000000..34c8ff7251f
--- /dev/null
+++ b/cpp/tests/labeling/label_bins_tests.cpp
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/labeling/label_bins.hpp>
+#include <cudf/types.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_list_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <algorithm>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+namespace {
+
+template <typename T>
+using fwc_wrapper = cudf::test::fixed_width_column_wrapper<T>;
+
+template <typename T>
+using fpc_wrapper = cudf::test::fixed_point_column_wrapper<T>;
+
+// TODO: Should we move these into type_lists? They seem generally useful.
+using cudf::test::FixedPointTypes;
+using cudf::test::FloatingPointTypes;
+using NumericTypesNotBool =
+  cudf::test::Concat<cudf::test::IntegralTypesNotBool, FloatingPointTypes>;
+using SignedNumericTypesNotBool =
+  cudf::test::Types<int8_t, int16_t, int32_t, int64_t, float, double>;
+
+struct BinTestFixture : public cudf::test::BaseFixture {
+};
+
+/*
+ * Test error cases.
+ *
+ * Most of these are not parameterized by type to avoid unnecessary test overhead.
+ */
+
+// Left edges type check.
+TEST(BinColumnErrorTests, TestInvalidLeft)
+{
+  fwc_wrapper<double> left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  fwc_wrapper<float> right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  fwc_wrapper<float> input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+  EXPECT_THROW(
+    cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
+    cudf::logic_error);
+};
+
+// Right edges type check.
+TEST(BinColumnErrorTests, TestInvalidRight)
+{
+  fwc_wrapper<float> left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  fwc_wrapper<double> right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  fwc_wrapper<float> input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+  EXPECT_THROW(
+    cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
+    cudf::logic_error);
+};
+
+// Input type check.
+TEST(BinColumnErrorTests, TestInvalidInput)
+{
+  fwc_wrapper<float> left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  fwc_wrapper<float> right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  fwc_wrapper<double> input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+  EXPECT_THROW(
+    cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
+    cudf::logic_error);
+};
+
+// Number of left and right edges must match.
+TEST(BinColumnErrorTests, TestMismatchedEdges)
+{
+  fwc_wrapper<float> left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  fwc_wrapper<float> right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9};
+  fwc_wrapper<float> input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+  EXPECT_THROW(
+    cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
+    cudf::logic_error);
+};
+
+// Left edges with nulls.
+TEST(BinColumnErrorTests, TestLeftEdgesWithNullsBefore)
+{
+  fwc_wrapper<float> left_edges{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  fwc_wrapper<float> right_edges{1, 2, 3, 4, 5, 6, 7, 8, 9};
+  fwc_wrapper<float> input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+  EXPECT_THROW(
+    cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO),
+    cudf::logic_error);
+};
+
+// Right edges with nulls.
+TEST(BinColumnErrorTests, TestRightEdgesWithNullsBefore)
+{
+  fwc_wrapper<float> left_edges{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+  fwc_wrapper<float> right_edges{{1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  fwc_wrapper<float> input{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
+
+  EXPECT_THROW(
+    cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO),
+    cudf::logic_error);
+};
+
+/*
+ * Valid exceptional cases.
+ */
+
+template <typename T>
+struct GenericExceptionCasesBinTestFixture : public BinTestFixture {
+  void test(fwc_wrapper<T> input,
+            fwc_wrapper<cudf::size_type> expected,
+            fwc_wrapper<T> left_edges,
+            fwc_wrapper<T> right_edges)
+  {
+    auto result =
+      cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+};
+
+template <typename T>
+struct ExceptionCasesBinTestFixture : public GenericExceptionCasesBinTestFixture<T> {
+};
+
+TYPED_TEST_CASE(ExceptionCasesBinTestFixture, NumericTypesNotBool);
+
+// Empty input must return an empty output.
+TYPED_TEST(ExceptionCasesBinTestFixture, TestEmptyInput)
+{
+  this->test({}, {}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10});
+};
+
+// If no edges are provided, the bin for all inputs is null.
+TYPED_TEST(ExceptionCasesBinTestFixture, TestEmptyEdges)
+{
+  this->test({1, 1}, {{0, 0}, {0, 0}}, {}, {});
+};
+
+// Values outside the bounds should be labeled NULL.
+TYPED_TEST(ExceptionCasesBinTestFixture, TestOutOfBoundsInput)
+{
+  this->test({7, 9, 11, 13}, {{3, 4, 0, 0}, {1, 1, 0, 0}}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10});
+};
+
+// Null inputs must map to nulls.
+TYPED_TEST(ExceptionCasesBinTestFixture, TestInputWithNulls)
+{
+  this->test(
+    {{1, 3, 5, 7}, {0, 1, 0, 1}}, {{0, 1, 0, 3}, {0, 1, 0, 1}}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10});
+};
+
+// Test that NaN values are assigned the NULL label.
+template <typename T>
+struct NaNBinTestFixture : public GenericExceptionCasesBinTestFixture<T> {
+};
+
+TYPED_TEST_CASE(NaNBinTestFixture, FloatingPointTypes);
+
+TYPED_TEST(NaNBinTestFixture, TestNaN)
+{
+  if (std::numeric_limits<TypeParam>::has_quiet_NaN) {
+    this->test(
+      {std::numeric_limits<TypeParam>::quiet_NaN()}, {{0}, {0}}, {0, 2, 4, 6, 8}, {2, 4, 6, 8, 10});
+  }
+}
+
+/*
+ * Test inclusion options.
+ */
+
+template <typename T>
+struct BoundaryExclusionBinTestFixture : public BinTestFixture {
+  void test(cudf::inclusive left_inc,
+            cudf::inclusive right_inc,
+            fwc_wrapper<cudf::size_type> expected)
+  {
+    fwc_wrapper<T> left_edges{0, 2, 4, 6, 8};
+    fwc_wrapper<T> right_edges{2, 4, 6, 8, 10};
+    fwc_wrapper<T> input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+
+    auto result = cudf::label_bins(input, left_edges, left_inc, right_edges, right_inc);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+};
+
+TYPED_TEST_CASE(BoundaryExclusionBinTestFixture, NumericTypesNotBool);
+
+// Boundary points when both bounds are excluded should be labeled null.
+TYPED_TEST(BoundaryExclusionBinTestFixture, TestNoIncludes)
+{
+  this->test(cudf::inclusive::NO,
+             cudf::inclusive::NO,
+             {{0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5}, {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}});
+};
+
+// Boundary point 2 should be in bin 1 [2, 4).
+TYPED_TEST(BoundaryExclusionBinTestFixture, TestIncludeLeft)
+{
+  this->test(cudf::inclusive::YES,
+             cudf::inclusive::NO,
+             {{0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 0}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}});
+};
+
+// Boundary point 2 should be in bin 0 (0, 2].
+TYPED_TEST(BoundaryExclusionBinTestFixture, TestIncludeRight)
+{
+  this->test(cudf::inclusive::NO,
+             cudf::inclusive::YES,
+             {{0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}});
+};
+
+/*
+ * Test real data.
+ */
+
+// Test numeric data of reasonable size with noncontiguous bins.
+template <typename T>
+struct RealDataBinTestFixture : public BinTestFixture {
+  void test(unsigned int num_elements   = 512,
+            unsigned int inputs_per_bin = 4,
+            T left_edge_start_val       = 0)
+  {
+    // Avoid values too large for T. NOTE(review): largest_value is narrowed to T first; check int8.
+    const T largest_value = (num_elements / inputs_per_bin) * 4;
+    num_elements          = std::min(std::numeric_limits<T>::max(), largest_value);
+
+    unsigned int num_edges = num_elements / inputs_per_bin;
+
+    std::vector<T> left_edge_vector(num_edges);
+    std::vector<T> right_edge_vector(num_edges);
+    std::vector<T> partial_input_vector(num_edges);
+    std::vector<T> input_vector;
+    std::vector<cudf::size_type> partial_expected_vector(num_edges);
+    std::vector<cudf::size_type> expected_vector;
+    std::vector<unsigned int> expected_validity(num_elements, 1);
+
+    std::iota(left_edge_vector.begin(), left_edge_vector.end(), left_edge_start_val);
+
+    // Create noncontiguous bins of width 2 separated by gaps of 2, and place one input in the
+    // middle of each bin.
+    std::transform(
+      left_edge_vector.begin(), left_edge_vector.end(), left_edge_vector.begin(), [](T val) {
+        return val * 4;
+      });
+    std::transform(
+      left_edge_vector.begin(), left_edge_vector.end(), right_edge_vector.begin(), [](T val) {
+        return val + 2;
+      });
+    std::transform(
+      left_edge_vector.begin(), left_edge_vector.end(), partial_input_vector.begin(), [](T val) {
+        return val + 1;
+      });
+    std::iota(partial_expected_vector.begin(), partial_expected_vector.end(), 0);
+
+    // Repeat the per-bin inputs and their expected labels inputs_per_bin times.
+    input_vector.reserve(num_elements);
+    expected_vector.reserve(num_elements);
+    for (unsigned int i = 0; i < inputs_per_bin; ++i) {
+      input_vector.insert(
+        input_vector.end(), partial_input_vector.begin(), partial_input_vector.end());
+      expected_vector.insert(
+        expected_vector.end(), partial_expected_vector.begin(), partial_expected_vector.end());
+    }
+
+    // Wrap the host vectors in column wrappers so they can be passed to label_bins.
+    fwc_wrapper<T> left_edges(left_edge_vector.begin(), left_edge_vector.end());
+    fwc_wrapper<T> right_edges(right_edge_vector.begin(), right_edge_vector.end());
+    fwc_wrapper<T> input(input_vector.begin(), input_vector.end());
+    fwc_wrapper<cudf::size_type> expected(
+      expected_vector.begin(), expected_vector.end(), expected_validity.begin());
+
+    auto result =
+      cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+};
+
+TYPED_TEST_CASE(RealDataBinTestFixture, NumericTypesNotBool);
+
+TYPED_TEST(RealDataBinTestFixture, TestRealData256) { this->test(256); };
+TYPED_TEST(RealDataBinTestFixture, TestRealData512) { this->test(512); };
+TYPED_TEST(RealDataBinTestFixture, TestRealData1024) { this->test(1024); };
+
+// Test negative numbers for signed types.
+template <typename T>
+struct NegativeNumbersBinTestFixture : public RealDataBinTestFixture<T> {
+  void test(unsigned int num_elements = 512, unsigned int inputs_per_bin = 4)
+  {
+    RealDataBinTestFixture<T>::test(
+      num_elements, inputs_per_bin, -static_cast<T>(num_elements / 2));
+  }
+};
+
+TYPED_TEST_CASE(NegativeNumbersBinTestFixture, SignedNumericTypesNotBool);
+
+TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers256) { this->test(256); };
+TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers512) { this->test(512); };
+TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers1024) { this->test(1024); };
+
+/*
+ * Test fixed point types.
+ */
+
+template <typename T>
+struct FixedPointBinTestFixture : public BinTestFixture {
+};
+
+TYPED_TEST_CASE(FixedPointBinTestFixture, FixedPointTypes);
+
+TYPED_TEST(FixedPointBinTestFixture, TestFixedPointData)
+{
+  using fpc_type_wrapper = fpc_wrapper<cudf::device_storage_type_t<TypeParam>>;
+
+  fpc_type_wrapper left_edges{{0, 10, 20, 30, 40, 50, 60, 70, 80, 90}, numeric::scale_type{0}};
+  fpc_type_wrapper right_edges{{10, 20, 30, 40, 50, 60, 70, 80, 90, 100}, numeric::scale_type{0}};
+  fpc_type_wrapper input{{25, 25, 25, 25, 25, 25, 25, 25, 25, 25}, numeric::scale_type{0}};
+
+  auto result =
+    cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO);
+
+  // Check that every element is placed in bin 2.
+  fwc_wrapper<cudf::size_type> expected{{2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
+                                        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+};
+
+/*
+ * Test strings.
+ */
+
+// Basic test of strings of lowercase alphanumerics.
+TEST(TestStringData, SimpleStringTest)
+{
+  cudf::test::strings_column_wrapper left_edges{"a", "b", "c", "d", "e"};
+  cudf::test::strings_column_wrapper right_edges{"b", "c", "d", "e", "f"};
+  cudf::test::strings_column_wrapper input{"abc", "bcd", "cde", "def", "efg"};
+
+  auto result =
+    cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO);
+
+  fwc_wrapper<cudf::size_type> expected{{0, 1, 2, 3, 4}, {1, 1, 1, 1, 1}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+};
+
+// Test non-ASCII characters.
+TEST(TestStringData, NonAsciiStringTest)
+{
+  cudf::test::strings_column_wrapper left_edges{"A"};
+  cudf::test::strings_column_wrapper right_edges{"z"};
+  cudf::test::strings_column_wrapper input{"Héllo",
+                                           "thesé",
+                                           "HERE",
+                                           "tést strings",
+                                           "",
+                                           "1.75",
+                                           "-34",
+                                           "+9.8",
+                                           "17¼",
+                                           "x³",
+                                           "2³",
+                                           " 12⅝",
+                                           "1234567890",
+                                           "de",
+                                           "\t\r\n\f "};
+
+  auto result =
+    cudf::label_bins(input, left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO);
+
+  fwc_wrapper<cudf::size_type> expected{{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+                                        {1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+// Test sliced non-ASCII characters.
+TEST(TestStringData, SlicedNonAsciiStringTest)
+{
+  cudf::test::strings_column_wrapper left_edges{"A"};
+  cudf::test::strings_column_wrapper right_edges{"z"};
+  cudf::test::strings_column_wrapper input{"Héllo",
+                                           "thesé",
+                                           "HERE",
+                                           "tést strings",
+                                           "",
+                                           "1.75",
+                                           "-34",
+                                           "+9.8",
+                                           "17¼",
+                                           "x³",
+                                           "2³",
+                                           " 12⅝",
+                                           "1234567890",
+                                           "de",
+                                           "\t\r\n\f "};
+
+  auto sliced_inputs = cudf::slice(input, {1, 5, 5, 11});
+
+  {
+    auto result = cudf::label_bins(
+      sliced_inputs[0], left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO);
+    fwc_wrapper<cudf::size_type> expected{{0, 0, 0, 0}, {1, 1, 1, 0}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+
+  {
+    auto result = cudf::label_bins(
+      sliced_inputs[1], left_edges, cudf::inclusive::NO, right_edges, cudf::inclusive::NO);
+    fwc_wrapper<cudf::size_type> expected{{0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 1, 0}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+}  // anonymous namespace
+
+CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/rolling/collect_list_test.cpp b/cpp/tests/rolling/collect_list_test.cpp
index 6a3a80601d0..de179223d68 100644
--- a/cpp/tests/rolling/collect_list_test.cpp
+++ b/cpp/tests/rolling/collect_list_test.cpp
@@ -64,7 +64,7 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow)
             static_cast<column_view>(foll_column).size());
 
   auto const result_column_based_window =
-    rolling_window(input_column, prev_column, foll_column, 1, make_collect_aggregation());
+    rolling_window(input_column, prev_column, foll_column, 1, make_collect_list_aggregation());
 
   auto const expected_result =
     lists_column_wrapper<T, int32_t>{
@@ -79,11 +79,11 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view());
 
   auto const result_fixed_window =
-    rolling_window(input_column, 2, 1, 1, make_collect_aggregation());
+    rolling_window(input_column, 2, 1, 1, make_collect_list_aggregation());
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_fixed_window->view());
 
   auto const result_with_nulls_excluded =
-    rolling_window(input_column, 2, 1, 1, make_collect_aggregation(null_policy::EXCLUDE));
+    rolling_window(input_column, 2, 1, 1, make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -104,7 +104,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputLists)
             static_cast<column_view>(foll_column).size());
 
   auto const result_column_based_window =
-    rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation());
+    rolling_window(input_column, prev_column, foll_column, 0, make_collect_list_aggregation());
 
   auto const expected_result =
     lists_column_wrapper<T, int32_t>{
@@ -120,7 +120,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputLists)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view());
 
   auto const result_with_nulls_excluded = rolling_window(
-    input_column, prev_column, foll_column, 0, make_collect_aggregation(null_policy::EXCLUDE));
+    input_column, prev_column, foll_column, 0, make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -138,7 +138,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputListsAtEnds)
   auto foll_column       = fixed_width_column_wrapper<size_type>{0, 1, 1, 1, 1, 0};
 
   auto const result =
-    rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation());
+    rolling_window(input_column, prev_column, foll_column, 0, make_collect_list_aggregation());
 
   auto const expected_result =
     lists_column_wrapper<T, int32_t>{{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}.release();
@@ -146,7 +146,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputListsAtEnds)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view());
 
   auto const result_with_nulls_excluded = rolling_window(
-    input_column, prev_column, foll_column, 0, make_collect_aggregation(null_policy::EXCLUDE));
+    input_column, prev_column, foll_column, 0, make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -164,11 +164,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods)
   auto const input_column = fixed_width_column_wrapper<T, int32_t>{0, 1, 2, 3, 4, 5};
   auto const num_elements = static_cast<column_view>(input_column).size();
 
-  auto preceding   = 2;
-  auto following   = 1;
-  auto min_periods = 3;
-  auto const result =
-    rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+  auto preceding    = 2;
+  auto following    = 1;
+  auto min_periods  = 3;
+  auto const result = rolling_window(
+    input_column, preceding, following, min_periods, make_collect_list_aggregation());
 
   auto const expected_result = lists_column_wrapper<T, int32_t>{
     {{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}},
@@ -183,7 +183,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods)
                    preceding,
                    following,
                    min_periods,
-                   make_collect_aggregation(null_policy::EXCLUDE));
+                   make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 
@@ -191,8 +191,8 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods)
   following   = 2;
   min_periods = 4;
 
-  auto result_2 =
-    rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+  auto result_2 = rolling_window(
+    input_column, preceding, following, min_periods, make_collect_list_aggregation());
   auto expected_result_2 = lists_column_wrapper<T, int32_t>{
     {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}},
     cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) {
@@ -206,7 +206,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods)
                    preceding,
                    following,
                    min_periods,
-                   make_collect_aggregation(null_policy::EXCLUDE));
+                   make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(),
                                       result_2_with_nulls_excluded->view());
@@ -228,11 +228,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods)
 
   {
     // One result row at each end should be null.
-    auto preceding   = 2;
-    auto following   = 1;
-    auto min_periods = 3;
-    auto const result =
-      rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+    auto preceding    = 2;
+    auto following    = 1;
+    auto min_periods  = 3;
+    auto const result = rolling_window(
+      input_column, preceding, following, min_periods, make_collect_list_aggregation());
 
     auto expected_result_child_values   = std::vector<int32_t>{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5};
     auto expected_result_child_validity = std::vector<bool>{1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1};
@@ -265,7 +265,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods)
                                        preceding,
                                        following,
                                        min_periods,
-                                       make_collect_aggregation(null_policy::EXCLUDE));
+                                       make_collect_list_aggregation(null_policy::EXCLUDE));
 
     auto expected_result_child_values = std::vector<int32_t>{0, 2, 2, 3, 2, 3, 3, 5};
     auto expected_result_child        = fixed_width_column_wrapper<T, int32_t>(
@@ -287,11 +287,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods)
 
   {
     // First result row, and the last two result rows should be null.
-    auto preceding   = 2;
-    auto following   = 2;
-    auto min_periods = 4;
-    auto const result =
-      rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+    auto preceding    = 2;
+    auto following    = 2;
+    auto min_periods  = 4;
+    auto const result = rolling_window(
+      input_column, preceding, following, min_periods, make_collect_list_aggregation());
 
     auto expected_result_child_values   = std::vector<int32_t>{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5};
     auto expected_result_child_validity = std::vector<bool>{1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1};
@@ -325,7 +325,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods)
                                        preceding,
                                        following,
                                        min_periods,
-                                       make_collect_aggregation(null_policy::EXCLUDE));
+                                       make_collect_list_aggregation(null_policy::EXCLUDE));
 
     auto expected_result_child_values = std::vector<int32_t>{0, 2, 3, 2, 3, 2, 3, 5};
     auto expected_result_child        = fixed_width_column_wrapper<T, int32_t>(
@@ -358,11 +358,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings)
   auto const input_column = strings_column_wrapper{"0", "1", "2", "3", "4", "5"};
   auto const num_elements = static_cast<column_view>(input_column).size();
 
-  auto preceding   = 2;
-  auto following   = 1;
-  auto min_periods = 3;
-  auto const result =
-    rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+  auto preceding    = 2;
+  auto following    = 1;
+  auto min_periods  = 3;
+  auto const result = rolling_window(
+    input_column, preceding, following, min_periods, make_collect_list_aggregation());
 
   auto const expected_result = lists_column_wrapper<string_view>{
     {{}, {"0", "1", "2"}, {"1", "2", "3"}, {"2", "3", "4"}, {"3", "4", "5"}, {}},
@@ -377,7 +377,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings)
                    preceding,
                    following,
                    min_periods,
-                   make_collect_aggregation(null_policy::EXCLUDE));
+                   make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 
@@ -385,8 +385,8 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings)
   following   = 2;
   min_periods = 4;
 
-  auto result_2 =
-    rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+  auto result_2 = rolling_window(
+    input_column, preceding, following, min_periods, make_collect_list_aggregation());
   auto expected_result_2 = lists_column_wrapper<string_view>{
     {{}, {"0", "1", "2", "3"}, {"1", "2", "3", "4"}, {"2", "3", "4", "5"}, {}, {}},
     cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) {
@@ -400,7 +400,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings)
                    preceding,
                    following,
                    min_periods,
-                   make_collect_aggregation(null_policy::EXCLUDE));
+                   make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(),
                                       result_2_with_nulls_excluded->view());
@@ -421,11 +421,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal)
 
   {
     // One result row at each end should be null.
-    auto preceding   = 2;
-    auto following   = 1;
-    auto min_periods = 3;
-    auto const result =
-      rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+    auto preceding    = 2;
+    auto following    = 1;
+    auto min_periods  = 3;
+    auto const result = rolling_window(
+      input_column, preceding, following, min_periods, make_collect_list_aggregation());
 
     auto expected_result_child_values = std::vector<int32_t>{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5};
     auto expected_result_child =
@@ -451,7 +451,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal)
                      preceding,
                      following,
                      min_periods,
-                     make_collect_aggregation(null_policy::EXCLUDE));
+                     make_collect_list_aggregation(null_policy::EXCLUDE));
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(),
                                         result_with_nulls_excluded->view());
@@ -459,11 +459,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal)
 
   {
     // First result row, and the last two result rows should be null.
-    auto preceding   = 2;
-    auto following   = 2;
-    auto min_periods = 4;
-    auto const result =
-      rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation());
+    auto preceding    = 2;
+    auto following    = 2;
+    auto min_periods  = 4;
+    auto const result = rolling_window(
+      input_column, preceding, following, min_periods, make_collect_list_aggregation());
 
     auto expected_result_child_values = std::vector<int32_t>{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5};
     auto expected_result_child =
@@ -489,7 +489,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal)
                      preceding,
                      following,
                      min_periods,
-                     make_collect_aggregation(null_policy::EXCLUDE));
+                     make_collect_list_aggregation(null_policy::EXCLUDE));
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(),
                                         result_with_nulls_excluded->view());
@@ -515,7 +515,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow)
                                              preceding,
                                              following,
                                              min_periods,
-                                             make_collect_aggregation());
+                                             make_collect_list_aggregation());
 
   auto const expected_result = lists_column_wrapper<T, int32_t>{
     {10, 11},
@@ -536,7 +536,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow)
                            preceding,
                            following,
                            min_periods,
-                           make_collect_aggregation(null_policy::EXCLUDE));
+                           make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -563,7 +563,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls)
                                                preceding,
                                                following,
                                                min_periods,
-                                               make_collect_aggregation());
+                                               make_collect_list_aggregation());
 
     auto expected_child = fixed_width_column_wrapper<T, int32_t>{
       {10, 11, 10, 11, 12, 11, 12, 13, 12, 13, 14, 13, 14, 20, 21, 20, 21, 22, 21, 22, 23, 22, 23},
@@ -587,7 +587,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls)
                                                preceding,
                                                following,
                                                min_periods,
-                                               make_collect_aggregation(null_policy::EXCLUDE));
+                                               make_collect_list_aggregation(null_policy::EXCLUDE));
 
     auto expected_child = fixed_width_column_wrapper<T, int32_t>{
       10, 10, 12, 12, 13, 12, 13, 14, 13, 14, 20, 20, 22, 22, 23, 22, 23};
@@ -627,7 +627,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto const expected_result = lists_column_wrapper<T, int32_t>{
     {10, 11, 12, 13},
@@ -650,7 +650,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -678,7 +678,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto null_at_0 = iterator_with_null_at(0);
   auto null_at_1 = iterator_with_null_at(1);
@@ -705,7 +705,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   // After null exclusion, `11`, `21`, and `null` should not appear.
   auto const expected_result_with_nulls_excluded = lists_column_wrapper<T, int32_t>{
@@ -744,7 +744,7 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto const expected_result = lists_column_wrapper<cudf::string_view>{
     {"10", "11", "12", "13"},
@@ -767,7 +767,7 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -793,7 +793,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNulls)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto null_at_0 = iterator_with_null_at(0);
   auto null_at_1 = iterator_with_null_at(1);
@@ -821,7 +821,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNulls)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   // After null exclusion, `11`, `21`, and `null` should not appear.
   auto const expected_result_with_nulls_excluded = lists_column_wrapper<cudf::string_view>{
@@ -868,7 +868,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto expected_numeric_column = fixed_width_column_wrapper<T, int32_t>{
     10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12,
@@ -898,7 +898,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -928,7 +928,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto const expected_result = lists_column_wrapper<T, int32_t>{
     {{10, 11, 12, 13},
@@ -954,7 +954,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -984,7 +984,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPer
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto null_at_1 = iterator_with_null_at(1);
 
@@ -1013,7 +1013,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPer
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   // After null exclusion, `11`, `21`, and `null` should not appear.
   auto const expected_result_with_nulls_excluded = lists_column_wrapper<T, int32_t>{
@@ -1056,7 +1056,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto const expected_result = lists_column_wrapper<cudf::string_view>{
     {{"10", "11", "12", "13"},
@@ -1082,7 +1082,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods)
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
@@ -1110,7 +1110,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPer
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto null_at_1 = iterator_with_null_at(1);
 
@@ -1139,7 +1139,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPer
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   // After null exclusion, `11`, `21`, and `null` should not appear.
   auto const expected_result_with_nulls_excluded = lists_column_wrapper<cudf::string_view>{
@@ -1190,7 +1190,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation());
+                                      make_collect_list_aggregation());
 
   auto expected_numeric_column = fixed_width_column_wrapper<T, int32_t>{
     10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14};
@@ -1226,7 +1226,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe
                                       preceding,
                                       following,
                                       min_periods,
-                                      make_collect_aggregation(null_policy::EXCLUDE));
+                                      make_collect_list_aggregation(null_policy::EXCLUDE));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view());
 }
diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu
index d6bf03b3f76..f15116ae4c2 100644
--- a/cpp/tests/strings/integers_tests.cu
+++ b/cpp/tests/strings/integers_tests.cu
@@ -26,20 +26,18 @@
 #include <string>
 #include <vector>
 
+// Named placeholder for the values of null elements in the expected columns.
+// This makes the test code easier to read.
+constexpr auto NULL_VAL = 0;
+
 struct StringsConvertTest : public cudf::test::BaseFixture {
 };
 
-TEST_F(StringsConvertTest, IsInteger)
+TEST_F(StringsConvertTest, IsIntegerBasicCheck)
 {
-  cudf::test::strings_column_wrapper strings;
-  auto strings_view = cudf::strings_column_view(strings);
-  auto results      = cudf::strings::is_integer(strings_view);
-  EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id());
-  EXPECT_EQ(0, results->view().size());
-
   cudf::test::strings_column_wrapper strings1(
     {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""});
-  results = cudf::strings::is_integer(cudf::strings_column_view(strings1));
+  auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1));
   cudf::test::fixed_width_column_wrapper<bool> expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1);
 
@@ -50,24 +48,187 @@ TEST_F(StringsConvertTest, IsInteger)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
 }
 
+TEST_F(StringsConvertTest, ZeroSizeIsIntegerBasicCheck)
+{
+  cudf::test::strings_column_wrapper strings;  // empty (zero-row) input
+  auto strings_view = cudf::strings_column_view(strings);
+  auto results      = cudf::strings::is_integer(strings_view);
+  EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id());  // result is still a BOOL8 column
+  EXPECT_EQ(0, results->view().size());                          // and it has no rows
+}
+
+TEST_F(StringsConvertTest, IsIntegerBoundCheckNoNull)
+{
+  auto strings = cudf::test::strings_column_wrapper(
+    {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""});
+  auto results = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                           cudf::data_type{cudf::type_id::INT32});
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<bool>({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // only "+175", "-34", "1234567890" pass
+
+  strings = cudf::test::strings_column_wrapper(
+    {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"});
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::INT32});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 1, 1, 1, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // signs and leading zeros are accepted
+}
+
+TEST_F(StringsConvertTest, IsIntegerBoundCheckWithNulls)
+{
+  std::vector<const char*> const h_strings{
+    "eee", "1234", nullptr, "", "-9832", "93.24", "765é", nullptr};
+  auto const strings = cudf::test::strings_column_wrapper(
+    h_strings.begin(),
+    h_strings.end(),
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto const results = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                                 cudf::data_type{cudf::type_id::INT32});
+  // The expected column reuses the input's validity: nullptr entries are null rows in both.
+  auto const expected = cudf::test::fixed_width_column_wrapper<bool>(
+    std::initializer_list<int8_t>{0, 1, NULL_VAL, 0, 1, 0, 0, NULL_VAL},
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
+TEST_F(StringsConvertTest, ZeroSizeIsIntegerBoundCheck)
+{
+  // Empty input: the type-bounded overload must still return a BOOL8 column, with no rows.
+  auto strings = cudf::test::strings_column_wrapper{};
+  auto results = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                           cudf::data_type{cudf::type_id::INT32});
+  EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id());
+  EXPECT_EQ(0, results->view().size());
+}
+
+TEST_F(StringsConvertTest, IsIntegerBoundCheckSmallNumbers)
+{
+  auto strings = cudf::test::strings_column_wrapper(
+    {"-200", "-129", "-128", "-120", "0", "120", "127", "130", "150", "255", "300", "500"});
+  auto results = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                           cudf::data_type{cudf::type_id::INT8});
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // INT8: "-128" through "127" pass
+
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::UINT8});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // UINT8: "0" through "255" pass
+
+  strings = cudf::test::strings_column_wrapper(
+    {"-40000", "-32769", "-32768", "-32767", "-32766", "32765", "32766", "32767", "32768"});
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::INT16});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 1, 1, 1, 1, 1, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // INT16: "-32768" through "32767" pass
+
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::UINT16});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 0, 1, 1, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // UINT16: negative inputs are rejected
+
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::INT32});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 1, 1, 1, 1, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // INT32: every input here passes
+}
+
+TEST_F(StringsConvertTest, IsIntegerBoundCheckLargeNumbers)
+{
+  auto strings =
+    cudf::test::strings_column_wrapper({"-2147483649",   // std::numeric_limits<int32_t>::min() - 1
+                                        "-2147483648",   // std::numeric_limits<int32_t>::min()
+                                        "-2147483647",   // std::numeric_limits<int32_t>::min() + 1
+                                        "2147483646",    // std::numeric_limits<int32_t>::max() - 1
+                                        "2147483647",    // std::numeric_limits<int32_t>::max()
+                                        "2147483648",    // std::numeric_limits<int32_t>::max() + 1
+                                        "4294967294",    // std::numeric_limits<uint32_t>::max() - 1
+                                        "4294967295",    // std::numeric_limits<uint32_t>::max()
+                                        "4294967296"});  // std::numeric_limits<uint32_t>::max() + 1
+  auto results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                           cudf::data_type{cudf::type_id::INT32});
+  auto expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 1, 1, 1, 0, 0, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // INT32: only int32_t min..max pass
+
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::UINT32});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 1, 1, 1, 1, 1, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // UINT32: non-negative up to uint32_t max
+
+  strings = cudf::test::strings_column_wrapper(
+    {"-9223372036854775809",    // std::numeric_limits<int64_t>::min() - 1
+     "-9223372036854775808",    // std::numeric_limits<int64_t>::min()
+     "-9223372036854775807",    // std::numeric_limits<int64_t>::min() + 1
+     "9223372036854775806",     // std::numeric_limits<int64_t>::max() - 1
+     "9223372036854775807",     // std::numeric_limits<int64_t>::max()
+     "9223372036854775808",     // std::numeric_limits<int64_t>::max() + 1
+     "18446744073709551614",    // std::numeric_limits<uint64_t>::max() - 1
+     "18446744073709551615",    // std::numeric_limits<uint64_t>::max()
+     "18446744073709551616"});  // std::numeric_limits<uint64_t>::max() + 1
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::INT64});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 1, 1, 1, 0, 0, 0, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // INT64: only int64_t min..max pass
+
+  results  = cudf::strings::is_integer(cudf::strings_column_view(strings),
+                                      cudf::data_type{cudf::type_id::UINT64});
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 1, 1, 1, 1, 1, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);  // UINT64: non-negative up to uint64_t max
+}
+
 TEST_F(StringsConvertTest, ToInteger)
 {
-  std::vector<const char*> h_strings{
-    "eee", "1234", nullptr, "", "-9832", "93.24", "765é", "-1.78e+5", "2147483647", "-2147483648"};
+  std::vector<const char*> h_strings{"eee",
+                                     "1234",
+                                     nullptr,
+                                     "",
+                                     "-9832",
+                                     "93.24",
+                                     "765é",
+                                     nullptr,
+                                     "-1.78e+5",
+                                     "2147483647",
+                                     "-2147483648",
+                                     "2147483648"};
   cudf::test::strings_column_wrapper strings(
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  std::vector<int32_t> h_expected{0, 1234, 0, 0, -9832, 93, 765, -1, 2147483647, -2147483648};
 
-  auto strings_view = cudf::strings_column_view(strings);
-  auto results = cudf::strings::to_integers(strings_view, cudf::data_type{cudf::type_id::INT32});
+  auto results            = cudf::strings::to_integers(cudf::strings_column_view(strings),
+                                            cudf::data_type{cudf::type_id::INT16});
+  auto const expected_i16 = cudf::test::fixed_width_column_wrapper<int16_t>(
+    std::initializer_list<int16_t>{0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, -1, 0, 0},
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i16);
 
-  cudf::test::fixed_width_column_wrapper<int32_t> expected(
-    h_expected.begin(),
-    h_expected.end(),
+  results                 = cudf::strings::to_integers(cudf::strings_column_view(strings),
+                                       cudf::data_type{cudf::type_id::INT32});
+  auto const expected_i32 = cudf::test::fixed_width_column_wrapper<int32_t>(
+    std::initializer_list<int32_t>{
+      0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, 2147483647, -2147483648, -2147483648},
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i32);
+
+  results                 = cudf::strings::to_integers(cudf::strings_column_view(strings),
+                                       cudf::data_type{cudf::type_id::UINT32});
+  auto const expected_u32 = cudf::test::fixed_width_column_wrapper<uint32_t>(
+    std::initializer_list<uint32_t>{0,
+                                    1234,
+                                    NULL_VAL,
+                                    0,
+                                    4294957464,
+                                    93,
+                                    765,
+                                    NULL_VAL,
+                                    4294967295,
+                                    2147483647,
+                                    2147483648,
+                                    2147483648},
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_u32);
 }
 
 TEST_F(StringsConvertTest, FromInteger)
@@ -114,7 +275,7 @@ TEST_F(StringsConvertTest, EmptyStringsColumn)
   cudf::test::strings_column_wrapper strings({"", "", ""});
   auto results = cudf::strings::to_integers(cudf::strings_column_view(strings),
                                             cudf::data_type{cudf::type_id::INT64});
-  cudf::test::fixed_width_column_wrapper<int64_t> expected({0, 0, 0});
+  cudf::test::fixed_width_column_wrapper<int64_t> expected{0, 0, 0};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
 }
 
diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst
index df7e92c125d..b4ca0321073 100644
--- a/docs/cudf/source/api.rst
+++ b/docs/cudf/source/api.rst
@@ -20,6 +20,13 @@ Series
     :inherited-members:
     :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list
 
+Lists
+-----
+.. currentmodule:: cudf.core.column.lists
+
+.. autoclass:: ListMethods
+    :members:
+
 Strings
 -------
 .. currentmodule:: cudf.core.column.string
@@ -253,4 +260,4 @@ GpuArrowReader
 .. currentmodule:: cudf.comm.gpuarrow
 .. autoclass:: GpuArrowReader
     :members:
-    :exclude-members: count, index
\ No newline at end of file
+    :exclude-members: count, index
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
index 3bc9adb5f49..e6675591164 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -1163,6 +1163,17 @@ public static ColumnVector decimalFromInts(int scale, int... values) {
     }
   }
 
+  /**
+   * Create a new DType.DECIMAL32 vector (max precision 9) from boxed unscaled values and scale.
+   * Values are staged in a temporary HostColumnVector that is closed after copyToDevice() returns.
+   * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning.
+   */
+  public static ColumnVector decimalFromBoxedInts(int scale, Integer... values) {
+    try (HostColumnVector host = HostColumnVector.decimalFromBoxedInts(scale, values)) {
+      return host.copyToDevice();
+    }
+  }
+
   /**
    * Create a new decimal vector from unscaled values (long array) and scale.
    * The created vector is of type DType.DECIMAL64, whose max precision is 18.
@@ -1174,6 +1185,17 @@ public static ColumnVector decimalFromLongs(int scale, long... values) {
     }
   }
 
+  /**
+   * Create a new DType.DECIMAL64 vector (max precision 18) from boxed unscaled values and scale.
+   * Values are staged in a temporary HostColumnVector that is closed after copyToDevice() returns.
+   * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning.
+   */
+  public static ColumnVector decimalFromBoxedLongs(int scale, Long... values) {
+    try (HostColumnVector host = HostColumnVector.decimalFromBoxedLongs(scale, values)) {
+      return host.copyToDevice();
+    }
+  }
+
   /**
    * Create a new decimal vector from double floats with specific DecimalType and RoundingMode.
    * All doubles will be rescaled if necessary, according to scale of input DecimalType and RoundingMode.
diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java
index 559256aa7bf..846bcb3b635 100644
--- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java
@@ -481,6 +481,23 @@ public static HostColumnVector decimalFromInts(int scale, int... values) {
     return build(DType.create(DType.DTypeEnum.DECIMAL32, scale), values.length, (b) -> b.appendUnscaledDecimalArray(values));
   }
 
+  /**
+   * Create a new DType.DECIMAL32 vector (max precision 9) from boxed unscaled values and scale.
+   * A null element is appended as a null entry; any other element as its unscaled value.
+   * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning.
+   */
+  public static HostColumnVector decimalFromBoxedInts(int scale, Integer... values) {
+    return build(DType.create(DType.DTypeEnum.DECIMAL32, scale), values.length, (b) -> {
+      for (Integer v : values) {
+        if (v == null) {
+          b.appendNull();
+        } else {
+          b.appendUnscaledDecimal(v);
+        }
+      }
+    });
+  }
+
   /**
    * Create a new decimal vector from unscaled values (long array) and scale.
    * The created vector is of type DType.DECIMAL64, whose max precision is 18.
@@ -490,6 +507,23 @@ public static HostColumnVector decimalFromLongs(int scale, long... values) {
     return build(DType.create(DType.DTypeEnum.DECIMAL64, scale), values.length, (b) -> b.appendUnscaledDecimalArray(values));
   }
 
+  /**
+   * Create a new DType.DECIMAL64 vector (max precision 18) from boxed unscaled values and scale.
+   * A null element is appended as a null entry; any other element as its unscaled value.
+   * Compared with scale of [[java.math.BigDecimal]], the scale here represents the opposite meaning.
+   */
+  public static HostColumnVector decimalFromBoxedLongs(int scale, Long... values) {
+    return build(DType.create(DType.DTypeEnum.DECIMAL64, scale), values.length, (b) -> {
+      for (Long v : values) {
+        if (v == null) {
+          b.appendNull();
+        } else {
+          b.appendUnscaledDecimal(v);
+        }
+      }
+    });
+  }
+
   /**
    * Create a new decimal vector from double floats with specific DecimalType and RoundingMode.
    * All doubles will be rescaled if necessary, according to scale of input DecimalType and RoundingMode.
diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp
index aae7cb493a8..c5184111edf 100644
--- a/java/src/main/native/src/AggregationJni.cpp
+++ b/java/src/main/native/src/AggregationJni.cpp
@@ -206,7 +206,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCollectAgg(JNIEnv
     cudf::jni::auto_set_device(env);
     cudf::null_policy policy =
         include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
-    std::unique_ptr<cudf::aggregation> ret = cudf::make_collect_aggregation(policy);
+    std::unique_ptr<cudf::aggregation> ret = cudf::make_collect_list_aggregation(policy);
     return reinterpret_cast<jlong>(ret.release());
   }
   CATCH_STD(env, 0);
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 73db5ee4df3..4132016d85c 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -42,6 +42,7 @@
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/convert/convert_booleans.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
+#include <cudf/strings/convert/convert_fixed_point.hpp>
 #include <cudf/strings/convert/convert_floats.hpp>
 #include <cudf/strings/convert/convert_integers.hpp>
 #include <cudf/strings/convert/convert_urls.hpp>
@@ -712,6 +713,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas
         case cudf::type_id::UINT64:
           result = cudf::strings::from_integers(*column);
           break;
+        case cudf::type_id::DECIMAL32:
+        case cudf::type_id::DECIMAL64:
+          result = cudf::strings::from_fixed_point(*column);
+          break;
         default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0);
       }
     } else if (column->type().id() == cudf::type_id::STRING) {
@@ -733,6 +738,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas
         case cudf::type_id::UINT64:
           result = cudf::strings::to_integers(*column, n_data_type);
           break;
+        case cudf::type_id::DECIMAL32:
+        case cudf::type_id::DECIMAL64:
+          result = cudf::strings::to_fixed_point(*column, n_data_type);
+          break;
         default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0);
       }
     } else if (cudf::is_timestamp(n_data_type) && cudf::is_numeric(column->type())) {
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 8b40f6e93d4..02fbe56431b 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -2266,6 +2266,73 @@ void testCastBoolToString() {
     testCastFixedWidthToStringsAndBack(DType.BOOL8, () -> ColumnVector.fromBoxedBooleans(booleans), () -> ColumnVector.fromStrings(stringBools));
   }
 
+  @Test
+  void testCastDecimal32ToString() {
+
+    Integer[] unScaledValues = {0, null, 3, 2, -43, null, 5234, -73451, 348093, -234810};
+    String[] strDecimalValues = new String[unScaledValues.length];
+    for (int scale : new int[]{-2, -1, 0, 1, 2}) {
+      for (int i = 0; i < strDecimalValues.length; i++) {
+        Long value = unScaledValues[i] == null ? null : Long.valueOf(unScaledValues[i]);
+        strDecimalValues[i] = dumpDecimal(value, scale);
+      }
+
+      testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL32, scale),
+          () -> ColumnVector.decimalFromBoxedInts(scale, unScaledValues),
+          () -> ColumnVector.fromStrings(strDecimalValues));
+    }
+  }
+
+  @Test
+  void testCastDecimal64ToString() {
+
+    Long[] unScaledValues = {0L, null, 3L, 2L, -43L, null, 234802L, -94582L, 1234208124L, -2342348023812L};
+    String[] strDecimalValues = new String[unScaledValues.length];
+    for (int scale : new int[]{-5, -2, -1, 0, 1, 2, 5}) {
+      // The expected strings depend on the scale, so rebuild them on every iteration.
+      for (int i = 0; i < strDecimalValues.length; i++) {
+        strDecimalValues[i] = dumpDecimal(unScaledValues[i], scale);
+      }
+
+      testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL64, scale),
+          () -> ColumnVector.decimalFromBoxedLongs(scale, unScaledValues),
+          () -> ColumnVector.fromStrings(strDecimalValues));
+    }
+  }
+
+  /**
+   * Formats {@code unscaledValue * 10^scale} as a plain decimal string for the decimal cast tests.
+   * `String.valueOf` can not be used because the string-to-decimal cast doesn't support
+   * scientific notation so far. Returns null when the unscaled value is null.
+   *
+   * issue for scientific notation: https://github.com/rapidsai/cudf/issues/7665
+   */
+  private static String dumpDecimal(Long unscaledValue, int scale) {
+    if (unscaledValue == null) return null;
+    StringBuilder builder = new StringBuilder();
+    if (unscaledValue < 0) builder.append('-');
+    // Math.abs(Long.MIN_VALUE) is still negative; its unsigned form is the true magnitude.
+    String absValue = Long.toUnsignedString(Math.abs(unscaledValue));
+
+    if (scale >= 0) {
+      builder.append(absValue);
+      for (int i = 0; i < scale; i++) builder.append('0');
+      return builder.toString();
+    }
+
+    if (absValue.length() <= -scale) {
+      builder.append('0').append('.');
+      for (int i = 0; i < -scale - absValue.length(); i++) builder.append('0');
+      builder.append(absValue);
+    } else {
+      int split = absValue.length() + scale;
+      builder.append(absValue.substring(0, split))
+          .append('.')
+          .append(absValue.substring(split));
+    }
+    return builder.toString();
+  }
+
   private static <T> String[] getStringArray(T[] input) {
     String[] result = new String[input.length];
     for (int i = 0 ; i < input.length ; i++) {
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 5c6801137ae..840f0c98987 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -50,6 +50,7 @@ class AggregationKind(Enum):
     NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE
     NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT
     COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT
+    COLLECT_SET = libcudf_aggregation.aggregation.Kind.COLLECT_SET
     PTX = libcudf_aggregation.aggregation.Kind.PTX
     CUDA = libcudf_aggregation.aggregation.Kind.CUDA
 
@@ -241,7 +242,13 @@ cdef class _AggregationFactory:
     @classmethod
     def collect(cls):
         cdef Aggregation agg = Aggregation.__new__(Aggregation)
-        agg.c_obj = move(libcudf_aggregation.make_collect_aggregation())
+        agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation())
+        return agg
+
+    @classmethod
+    def collect_set(cls):
+        cdef Aggregation agg = Aggregation.__new__(Aggregation)
+        agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation())
         return agg
 
     @classmethod
diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd
index 660db29f7a9..e9836c11361 100644
--- a/python/cudf/cudf/_lib/cpp/aggregation.pxd
+++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd
@@ -34,7 +34,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
             ARGMIN 'cudf::aggregation::ARGMIN'
             NUNIQUE 'cudf::aggregation::NUNIQUE'
             NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT'
-            COLLECT 'cudf::aggregation::COLLECT'
+            COLLECT 'cudf::aggregation::COLLECT_LIST'
+            COLLECT_SET 'cudf::aggregation::COLLECT_SET'
             PTX 'cudf::aggregation::PTX'
             CUDA 'cudf::aggregation::CUDA'
         Kind kind
@@ -83,7 +84,9 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
         size_type n
     ) except +
 
-    cdef unique_ptr[aggregation] make_collect_aggregation() except +
+    cdef unique_ptr[aggregation] make_collect_list_aggregation() except +
+
+    cdef unique_ptr[aggregation] make_collect_set_aggregation() except +
 
     cdef unique_ptr[aggregation] make_udf_aggregation(
         udf_type type,
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index 519565fa48c..39da6b26502 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -70,6 +70,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         column_in_metadata& set_nullability(bool nullable)
         column_in_metadata& set_list_column_as_map()
         column_in_metadata& set_int96_timestamps(bool req)
+        column_in_metadata& set_decimal_precision(uint8_t precision)
         column_in_metadata& child(size_type i)
 
     cdef cppclass table_input_metadata:
diff --git a/python/cudf/cudf/_lib/cpp/lists/extract.pxd b/python/cudf/cudf/_lib/cpp/lists/extract.pxd
new file mode 100644
index 00000000000..89fa893c17d
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/extract.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+
+from cudf._lib.cpp.types cimport size_type
+
+cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] extract_list_element(
+        const lists_column_view,
+        size_type
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd
new file mode 100644
index 00000000000..55e8e09427c
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/sorting.pxd
@@ -0,0 +1,15 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.types cimport order, null_order
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+
+
+cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] sort_lists(
+        const lists_column_view source_column,
+        order column_order,
+        null_order null_precedence
+    ) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 0f0ee35556a..2971aad8313 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -10,23 +10,29 @@ from cudf._lib.cpp.lists.count_elements cimport (
 from cudf._lib.cpp.lists.explode cimport (
     explode_outer as cpp_explode_outer
 )
+from cudf._lib.cpp.lists.sorting cimport (
+    sort_lists as cpp_sort_lists
+)
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.column.column cimport column
 
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.cpp.types cimport size_type, order, null_order
 
 from cudf._lib.column cimport Column
 from cudf._lib.table cimport Table
 
+from cudf._lib.types cimport (
+    underlying_type_t_null_order, underlying_type_t_order
+)
 from cudf.core.dtypes import ListDtype
 
+from cudf._lib.cpp.lists.extract cimport extract_list_element
+
 
 def count_elements(Column col):
-    if not isinstance(col.dtype, ListDtype):
-        raise TypeError("col is not a list column.")
 
     # shared_ptr required because lists_column_view has no default
     # ctor
@@ -58,3 +64,39 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
         column_names=tbl._column_names,
         index_names=None if ignore_index else tbl._index_names
     )
+
+
+def sort_lists(Column col, bool ascending, str na_position):
+    cdef shared_ptr[lists_column_view] list_view = (
+        make_shared[lists_column_view](col.view())
+    )
+    cdef order c_sort_order = (
+        order.ASCENDING if ascending else order.DESCENDING
+    )
+    cdef null_order c_null_prec = (
+        null_order.BEFORE if na_position == "first" else null_order.AFTER
+    )
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec)
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def extract_element(Column col, size_type index):
+    # shared_ptr required because lists_column_view has no default
+    # ctor
+    cdef shared_ptr[lists_column_view] list_view = (
+        make_shared[lists_column_view](col.view())
+    )
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(extract_list_element(list_view.get()[0], index))
+
+    result = Column.from_unique_ptr(move(c_result))
+    return result
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 87179c02fe2..0158df46cc4 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -20,7 +20,8 @@ from cudf.utils.dtypes import (
     np_to_pa_dtype,
     is_categorical_dtype,
     is_list_dtype,
-    is_struct_dtype
+    is_struct_dtype,
+    is_decimal_dtype,
 )
 
 from cudf._lib.utils cimport get_column_names
@@ -310,7 +311,7 @@ cpdef write_parquet(
 
     for i, name in enumerate(table._column_names, num_index_cols_meta):
         tbl_meta.get().column_metadata[i].set_name(name.encode())
-        _set_col_children_names(
+        _set_col_metadata(
             table[name]._column, tbl_meta.get().column_metadata[i]
         )
 
@@ -448,7 +449,7 @@ cdef class ParquetWriter:
 
         for i, name in enumerate(table._column_names, num_index_cols_meta):
             self.tbl_meta.get().column_metadata[i].set_name(name.encode())
-            _set_col_children_names(
+            _set_col_metadata(
                 table[name]._column, self.tbl_meta.get().column_metadata[i]
             )
 
@@ -546,14 +547,16 @@ cdef Column _update_column_struct_field_names(
         col.set_base_children(tuple(children))
     return col
 
-cdef _set_col_children_names(Column col, column_in_metadata& col_meta):
+cdef _set_col_metadata(Column col, column_in_metadata& col_meta):
     if is_struct_dtype(col):
         for i, (child_col, name) in enumerate(
             zip(col.children, list(col.dtype.fields))
         ):
             col_meta.child(i).set_name(name.encode())
-            _set_col_children_names(child_col, col_meta.child(i))
+            _set_col_metadata(child_col, col_meta.child(i))
     elif is_list_dtype(col):
-        _set_col_children_names(col.children[1], col_meta.child(1))
+        _set_col_metadata(col.children[1], col_meta.child(1))
     else:
+        if is_decimal_dtype(col):
+            col_meta.set_decimal_precision(col.dtype.precision)
         return
diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index dba0abb9cf0..f97b45d8abf 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -34,8 +34,8 @@ cdef class Table:
 
         Parameters
         ----------
-        data : OrderedColumnDict
-            An OrderedColumnDict mapping column names to Columns
+        data : dict
+            A dict mapping column names to Columns
         index : Table
             A Table representing the (optional) index columns.
         """
@@ -109,7 +109,7 @@ cdef class Table:
                 it += 1
             index = Table(dict(zip(index_names, index_columns)))
 
-        # Construct the data OrderedColumnDict
+        # Construct the data dict
         data_columns = []
         for _ in column_names:
             data_columns.append(Column.from_unique_ptr(move(dereference(it))))
@@ -154,7 +154,7 @@ cdef class Table:
                 column_idx += 1
             index = Table(dict(zip(index_names, index_columns)))
 
-        # Construct the data OrderedColumnDict
+        # Construct the data dict
         cdef size_type source_column_idx = 0
         data_columns = []
         for _ in column_names:
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 6698a47b416..4fe795e57a9 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -23,6 +23,7 @@ from cudf.utils.dtypes import (
     is_categorical_dtype,
     is_list_dtype,
     is_struct_dtype,
+    is_decimal_dtype,
 )
 
 
@@ -80,7 +81,11 @@ cpdef generate_pandas_metadata(Table table, index):
                 "'category' column dtypes are currently not "
                 + "supported by the gpu accelerated parquet writer"
             )
-        elif is_list_dtype(col) or is_struct_dtype(col):
+        elif (
+            is_list_dtype(col)
+            or is_struct_dtype(col)
+            or is_decimal_dtype(col)
+        ):
             types.append(col.dtype.to_arrow())
         else:
             types.append(np_to_pa_dtype(col.dtype))
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 1d3f73822a9..2204fbdea1f 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -7,7 +7,7 @@
 
 import cudf
 from cudf._lib.copying import segmented_gather
-from cudf._lib.lists import count_elements
+from cudf._lib.lists import count_elements, extract_element, sort_lists
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
@@ -178,6 +178,38 @@ def __init__(self, column, parent=None):
             )
         super().__init__(column=column, parent=parent)
 
+    def get(self, index):
+        """
+        Extract element at the given index from each list
+
+        Negative indices count from the end of each list. An IndexError is
+        raised if ``index`` is out of range for the shortest list.
+
+        Parameters
+        ----------
+        index : int
+
+        Returns
+        -------
+        Series or Index
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
+        >>> s.list.get(-1)
+        0    3
+        1    5
+        2    6
+        dtype: int64
+        """
+        min_col_list_len = self.len().min()
+        if -min_col_list_len <= index < min_col_list_len:
+            return self._return_or_inplace(
+                extract_element(self._column, index)
+            )
+        else:
+            raise IndexError("list index out of range")
+
     @property
     def leaves(self):
         """
@@ -285,3 +317,57 @@ def take(self, lists_indices):
             raise
         else:
             return res
+
+    def sort_values(
+        self,
+        ascending=True,
+        inplace=False,
+        kind="quicksort",
+        na_position="last",
+        ignore_index=False,
+    ):
+        """
+        Sort each list by the values.
+
+        Sort the elements within each list in ascending or descending order.
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            If True, sort values in ascending order, otherwise descending.
+        na_position : {'first', 'last'}, default 'last'
+            'first' puts nulls at the beginning, 'last' puts nulls at the end.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, ..., n - 1.
+
+        Returns
+        -------
+        Series or Index with each list sorted
+
+        Notes
+        -----
+        Difference from pandas:
+          * Not supporting: `inplace`, `kind`
+
+        Examples
+        --------
+        >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]])
+        >>> s.list.sort_values(ascending=True, na_position="last")
+        0    [2.0, 4.0, 9.0, nan]
+        1         [2.0, 8.0, 8.0]
+        2              [1.0, 2.0]
+        dtype: list
+        """
+        if inplace:
+            raise NotImplementedError("`inplace` not currently implemented.")
+        if kind != "quicksort":
+            raise NotImplementedError("`kind` not currently implemented.")
+        if na_position not in {"first", "last"}:
+            raise ValueError(f"Unknown `na_position` value {na_position}")
+        if is_list_dtype(self._column.children[1].dtype):
+            raise NotImplementedError("Nested lists sort is not supported.")
+
+        return self._return_or_inplace(
+            sort_lists(self._column, ascending, na_position),
+            retain_index=not ignore_index,
+        )
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 03743e4464b..0c580132290 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -3,12 +3,12 @@
 from __future__ import annotations
 
 import itertools
-from collections import OrderedDict
 from collections.abc import MutableMapping
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
+    Dict,
     Mapping,
     Optional,
     Tuple,
@@ -18,8 +18,8 @@
 import pandas as pd
 
 import cudf
+from cudf.core import column
 from cudf.utils.utils import (
-    OrderedColumnDict,
     cached_property,
     to_flat_dict,
     to_nested_dict,
@@ -31,7 +31,7 @@
 
 class ColumnAccessor(MutableMapping):
 
-    _data: "OrderedDict[Any, ColumnBase]"
+    _data: "Dict[Any, ColumnBase]"
     multiindex: bool
     _level_names: Tuple[Any, ...]
 
@@ -63,10 +63,26 @@ def __init__(
             self._data = data._data
             self.multiindex = multiindex
             self._level_names = level_names
+        else:
+            # This code path is performance-critical for copies and should be
+            # modified with care.
+            self._data = {}
+            if data:
+                data = dict(data)
+                # Faster than next(iter(data.values()))
+                column_length = len(data[next(iter(data))])
+                for k, v in data.items():
+                    # Much faster to avoid the function call if possible; the
+                    # extra isinstance is negligible if we do have to make a
+                    # column from something else.
+                    if not isinstance(v, column.ColumnBase):
+                        v = column.as_column(v)
+                    if len(v) != column_length:
+                        raise ValueError("All columns must be of equal length")
+                    self._data[k] = v
 
-        self._data = OrderedColumnDict(data)
-        self.multiindex = multiindex
-        self._level_names = level_names
+            self.multiindex = multiindex
+            self._level_names = level_names
 
     def __iter__(self):
         return self._data.__iter__()
@@ -76,7 +92,6 @@ def __getitem__(self, key: Any) -> ColumnBase:
 
     def __setitem__(self, key: Any, value: Any):
         self.set_by_label(key, value)
-        self._clear_cache()
 
     def __delitem__(self, key: Any):
         self._data.__delitem__(key)
@@ -144,6 +159,13 @@ def _grouped_data(self) -> MutableMapping:
         else:
             return self._data
 
+    @cached_property
+    def _column_length(self):
+        try:
+            return len(self._data[next(iter(self._data))])
+        except StopIteration:
+            return 0
+
     def _clear_cache(self):
         cached_properties = "columns", "names", "_grouped_data"
         for attr in cached_properties:
@@ -152,6 +174,10 @@ def _clear_cache(self):
             except AttributeError:
                 pass
 
+        # Column length should only be cleared if no data is present.
+        if len(self._data) == 0 and hasattr(self, "_column_length"):
+            del self._column_length
+
     def to_pandas_index(self) -> pd.Index:
         """"
         Convert the keys of the ColumnAccessor to a Pandas Index object.
@@ -169,7 +195,9 @@ def to_pandas_index(self) -> pd.Index:
             result = pd.Index(self.names, name=self.name, tupleize_cols=False)
         return result
 
-    def insert(self, name: Any, value: Any, loc: int = -1):
+    def insert(
+        self, name: Any, value: Any, loc: int = -1, validate: bool = True
+    ):
         """
         Insert column into the ColumnAccessor at the specified location.
 
@@ -199,6 +227,13 @@ def insert(self, name: Any, value: Any, loc: int = -1):
         if name in self._data:
             raise ValueError(f"Cannot insert '{name}', already exists")
         if loc == len(self._data):
+            if validate:
+                value = column.as_column(value)
+                if len(self._data) > 0:
+                    if len(value) != self._column_length:
+                        raise ValueError("All columns must be of equal length")
+                else:
+                    self._column_length = len(value)
             self._data[name] = value
         else:
             new_keys = self.names[:loc] + (name,) + self.names[loc:]
@@ -270,16 +305,29 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
             data, multiindex=self.multiindex, level_names=self.level_names,
         )
 
-    def set_by_label(self, key: Any, value: Any):
+    def set_by_label(self, key: Any, value: Any, validate: bool = True):
         """
         Add (or modify) column by name.
 
         Parameters
         ----------
-        key : name of the column
+        key
+            name of the column
         value : column-like
+            The value to insert into the column.
+        validate : bool
+            If True, the provided value will be coerced to a column and
+            validated before setting (Default value = True).
         """
         key = self._pad_key(key)
+        if validate:
+            value = column.as_column(value)
+            if len(self._data) > 0:
+                if len(value) != self._column_length:
+                    raise ValueError("All columns must be of equal length")
+            else:
+                self._column_length = len(value)
+
         self._data[key] = value
         self._clear_cache()
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 812a20cba45..bd009a9ad84 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -52,7 +52,6 @@
     is_struct_dtype,
     numeric_normalize_types,
 )
-from cudf.utils.utils import OrderedColumnDict
 
 T = TypeVar("T", bound="DataFrame")
 
@@ -4854,7 +4853,7 @@ def hash_columns(self, columns=None):
             table_to_hash = self
         else:
             cols = [self[k]._column for k in columns]
-            table_to_hash = Frame(data=OrderedColumnDict(zip(columns, cols)))
+            table_to_hash = Frame(data=dict(zip(columns, cols)))
 
         return Series(table_to_hash._hash()).values
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index bfcc2d125db..e6898b8c606 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -40,8 +40,8 @@ class Frame(libcudf.table.Table):
 
     Parameters
     ----------
-    data : OrderedColumnDict
-        An OrderedColumnDict mapping column names to Columns
+    data : dict
+        A dict mapping column names to Columns
     index : Table
         A Frame representing the (optional) index columns.
     """
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 7ed2157277c..9d4643da637 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3571,6 +3571,7 @@ def sort_values(
         4    3
         3    4
         1    5
+        dtype: int64
         """
 
         if inplace:
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index b3ba439cb15..76a02d5e74a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5222,7 +5222,7 @@ def test_memory_usage_multi():
 def test_setitem_diff_size_list(list_input, key):
     gdf = cudf.datasets.randomdata(5)
     with pytest.raises(
-        ValueError, match=("All values must be of equal length")
+        ValueError, match=("All columns must be of equal length")
     ):
         gdf[key] = list_input
 
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 33812cfa7a7..2ab1382b34e 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+import functools
 
 import pandas as pd
 import pyarrow as pa
@@ -159,3 +160,89 @@ def test_take_invalid(invalid, exception):
     gs = cudf.Series([[0, 1], [2, 3]])
     with exception:
         gs.list.take(invalid)
+
+
+def key_func_builder(x, na_position):
+    if x is None:
+        if na_position == "first":
+            return -1e8
+        else:
+            return 1e8
+    else:
+        return x
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [[4, 2, None, 9], [8, 8, 2], [2, 1]],
+        [[4, 2, None, 9], [8, 8, 2], None],
+        [[4, 2, None, 9], [], None],
+    ],
+)
+@pytest.mark.parametrize(
+    "index",
+    [
+        None,
+        pd.Index(["a", "b", "c"]),
+        pd.MultiIndex.from_tuples(
+            [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
+        ),
+    ],
+)
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("na_position", ["first", "last"])
+@pytest.mark.parametrize("ignore_index", [True, False])
+def test_sort_values(data, index, ascending, na_position, ignore_index):
+    key_func = functools.partial(key_func_builder, na_position=na_position)
+
+    ps = pd.Series(data, index=index)
+    gs = cudf.from_pandas(ps)
+
+    expected = ps.apply(
+        lambda x: sorted(x, key=key_func, reverse=not ascending)
+        if x is not None
+        else None
+    )
+    if ignore_index:
+        expected.reset_index(drop=True, inplace=True)
+    got = gs.list.sort_values(
+        ascending=ascending, na_position=na_position, ignore_index=ignore_index
+    )
+
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    "data, index, expect",
+    [
+        ([[None, None], [None, None]], 0, [None, None]),
+        ([[1, 2], [3, 4]], 0, [1, 3]),
+        ([["a", "b"], ["c", "d"]], 1, ["b", "d"]),
+        ([[1, None], [None, 2]], 1, [None, 2]),
+        ([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], 1, [[3, 4], [7, 8]]),
+    ],
+)
+def test_get(data, index, expect):
+    sr = cudf.Series(data)
+    expect = cudf.Series(expect)
+    got = sr.list.get(index)
+    assert_eq(expect, got)
+
+
+def test_get_nested_lists():
+    sr = cudf.Series(
+        [
+            [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [], [[3, 4], [7, 8]]],
+            [[], [[9, 10]], [[11, 12], [13, 14]]],
+        ]
+    )
+    expect = cudf.Series([[[1, 2], [3, 4]], []])
+    got = sr.list.get(0)
+    assert_eq(expect, got)
+
+
+def test_get_nulls():
+    sr = cudf.Series([[], [], []])
+    with pytest.raises(IndexError, match="list index out of range"):
+        sr.list.get(100)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 6d50e4b6fee..a7a11c95e30 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1026,7 +1026,7 @@ def test_parquet_reader_list_skiprows(skip, tmpdir):
     assert_eq(expect, got, check_dtype=False)
 
 
-@pytest.mark.parametrize("skip", range(0, 128))
+@pytest.mark.parametrize("skip", range(0, 120))
 def test_parquet_reader_list_num_rows(skip, tmpdir):
     num_rows = 128
     src = pd.DataFrame(
@@ -1043,7 +1043,8 @@ def test_parquet_reader_list_num_rows(skip, tmpdir):
     src.to_parquet(fname)
     assert os.path.exists(fname)
 
-    rows_to_read = min(3, num_rows - skip)
+    # make sure to leave a few rows at the end that we don't read
+    rows_to_read = min(3, (num_rows - skip) - 5)
     expect = src.iloc[skip:].head(rows_to_read)
     got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read)
     assert_eq(expect, got, check_dtype=False)
@@ -1920,3 +1921,18 @@ def test_parquet_writer_nested(tmpdir, data):
 
     got = pd.read_parquet(fname)
     assert_eq(expect, got)
+
+
+def test_parquet_writer_decimal(tmpdir):
+    from cudf.core.dtypes import Decimal64Dtype
+
+    gdf = cudf.DataFrame({"val": [0.00, 0.01, 0.02]})
+
+    gdf["dec_val"] = gdf["val"].astype(Decimal64Dtype(7, 2))
+
+    fname = tmpdir.join("test_parquet_writer_decimal.parquet")
+    gdf.to_parquet(fname)
+    assert os.path.exists(fname)
+
+    got = pd.read_parquet(fname)
+    assert_eq(gdf, got)
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 03a39f6fb4b..ba9fa734248 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -280,36 +280,6 @@ def __get__(self, instance, cls):
             return value
 
 
-class ColumnValuesMappingMixin:
-    """
-    Coerce provided values for the mapping to Columns.
-    """
-
-    def __setitem__(self, key, value):
-
-        value = column.as_column(value)
-        super().__setitem__(key, value)
-
-
-class EqualLengthValuesMappingMixin:
-    """
-    Require all values in the mapping to have the same length.
-    """
-
-    def __setitem__(self, key, value):
-        if len(self) > 0:
-            first = next(iter(self.values()))
-            if len(value) != len(first):
-                raise ValueError("All values must be of equal length")
-        super().__setitem__(key, value)
-
-
-class OrderedColumnDict(
-    ColumnValuesMappingMixin, EqualLengthValuesMappingMixin, OrderedDict
-):
-    pass
-
-
 class NestedMappingMixin:
     """
     Make missing values of a mapping empty instances