diff --git a/build.sh b/build.sh index bc49b76d44e..70b93427d5c 100755 --- a/build.sh +++ b/build.sh @@ -134,18 +134,20 @@ if hasArg clean; then done fi -if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then - CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=" - echo "Building for the architecture of the GPU in the system..." -else - CUDF_CMAKE_CUDA_ARCHITECTURES="" - echo "Building for *ALL* supported GPU architectures..." -fi ################################################################################ # Configure, build, and install libcudf if buildAll || hasArg libcudf; then + + if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then + CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=" + echo "Building for the architecture of the GPU in the system..." + else + CUDF_CMAKE_CUDA_ARCHITECTURES="" + echo "Building for *ALL* supported GPU architectures..." + fi + cmake -S $REPODIR/cpp -B ${LIB_BUILD_DIR} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ ${CUDF_CMAKE_CUDA_ARCHITECTURES} \ @@ -192,19 +194,16 @@ fi # Build libcudf_kafka library if hasArg libcudf_kafka; then cmake -S $REPODIR/cpp/libcudf_kafka -B ${KAFKA_LIB_BUILD_DIR} \ - ${CUDF_CMAKE_CUDA_ARCHITECTURES} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DBUILD_TESTS=${BUILD_TESTS} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + cd ${KAFKA_LIB_BUILD_DIR} + cmake --build . -j${PARALLEL_LEVEL} ${VERBOSE_FLAG} + if [[ ${INSTALL_TARGET} != "" ]]; then cmake --build . -j${PARALLEL_LEVEL} --target install ${VERBOSE_FLAG} - else - cmake --build . -j${PARALLEL_LEVEL} --target libcudf_kafka ${VERBOSE_FLAG} - fi - - if [[ ${BUILD_TESTS} == "ON" ]]; then - cmake --build . -j${PARALLEL_LEVEL} --target build_tests_libcudf_kafka ${VERBOSE_FLAG} fi fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 61f551f4b6d..819a0dcf6bf 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -49,6 +49,9 @@ function sed_runner() { # cpp update sed_runner 's/'"CUDA_DATAFRAME VERSION .* LANGUAGES"'/'"CUDA_DATAFRAME VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +# cpp libcudf_kafka update +sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt + # doxyfile update sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1be8a6b450a..39587b4bd05 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -33,7 +33,7 @@ build: requirements: build: - - cmake >=3.17.0 + - cmake >=3.18 host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fc439ebfa7f..48562476070 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -554,12 +554,6 @@ if(CUDF_BUILD_BENCHMARKS) GIT_SHALLOW TRUE OPTIONS "BENCHMARK_ENABLE_TESTING OFF" "BENCHMARK_ENABLE_INSTALL OFF") - if(benchmark_ADDED) - install(TARGETS benchmark - benchmark_main - DESTINATION lib - EXPORT cudf-targets) - endif() add_subdirectory(benchmarks) endif() @@ -636,6 +630,15 @@ elseif(TARGET arrow_static) endif() endif() +if(TARGET gtest) + get_target_property(gtest_is_imported gtest IMPORTED) + if(NOT gtest_is_imported) + export(TARGETS gtest gmock gtest_main gmock_main + FILE ${CUDF_BINARY_DIR}/cudf-gtesting-targets.cmake + NAMESPACE GTest::) + endif() +endif() + export(EXPORT cudf-targets FILE ${CUDF_BINARY_DIR}/cudf-targets.cmake NAMESPACE cudf::) diff --git 
a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7fd84b508ac..5aa7e0132f8 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -175,10 +175,12 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu) ################################################################################################### # - nvtext benchmark ------------------------------------------------------------------- ConfigureBench(TEXT_BENCH + text/ngrams_benchmark.cpp text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp - text/tokenize_benchmark.cpp - text/subword_benchmark.cpp) + text/replace_benchmark.cpp + text/subword_benchmark.cpp + text/tokenize_benchmark.cpp) ################################################################################################### # - strings benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/text/ngrams_benchmark.cpp b/cpp/benchmarks/text/ngrams_benchmark.cpp new file mode 100644 index 00000000000..1fe8e3b7f2e --- /dev/null +++ b/cpp/benchmarks/text/ngrams_benchmark.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +class TextNGrams : public cudf::benchmark { +}; + +enum class ngrams_type { tokens, characters }; + +static void BM_ngrams(benchmark::State& state, ngrams_type nt) +{ + auto const n_rows = static_cast(state.range(0)); + auto const max_str_length = static_cast(state.range(1)); + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (nt) { + case ngrams_type::tokens: nvtext::generate_ngrams(input); break; + case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 5; + int const max_rowlen = 40; + int const len_mult = 2; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define NVTEXT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(TextNGrams, name) \ + (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \ + BENCHMARK_REGISTER_F(TextNGrams, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(tokens) +NVTEXT_BENCHMARK_DEFINE(characters) diff --git a/cpp/benchmarks/text/replace_benchmark.cpp b/cpp/benchmarks/text/replace_benchmark.cpp new 
file mode 100644 index 00000000000..f5428aee225 --- /dev/null +++ b/cpp/benchmarks/text/replace_benchmark.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class TextReplace : public cudf::benchmark { +}; + +static void BM_replace(benchmark::State& state) +{ + auto const n_rows = static_cast(state.range(0)); + auto const n_length = static_cast(state.range(1)); + + std::vector words{" ", "one ", "two ", "three ", "four ", + "five ", "six ", "sevén ", "eight ", "nine ", + "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", + "fifteen ", "sixteen ", "seventeen ", "eighteen ", "nineteen "}; + + std::default_random_engine generator; + std::uniform_int_distribution tokens_dist(0, words.size() - 1); + std::string row; // build a row of random tokens + while (static_cast(row.size()) < n_length) row += words[tokens_dist(generator)]; + + std::uniform_int_distribution position_dist(0, 16); + + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&](auto idx) { return row.c_str() + position_dist(generator); }); + cudf::test::strings_column_wrapper input(elements, elements + n_rows); + cudf::strings_column_view view(input); + + cudf::test::strings_column_wrapper targets({"one", "two", "sevén", "zero"}); + cudf::test::strings_column_wrapper replacements({"1", "2", "7", "0"}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + nvtext::replace_tokens( + view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements)); + } + + state.SetBytesProcessed(state.iterations() * view.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_multiplier = 8; + int const min_row_length = 1 << 5; + int const max_row_length = 1 << 13; + int const length_multiplier = 4; + generate_string_bench_args( + b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier); +} + +#define NVTEXT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(TextReplace, name) \ + (::benchmark::State & st) { BM_replace(st); } \ + BENCHMARK_REGISTER_F(TextReplace, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +NVTEXT_BENCHMARK_DEFINE(replace) diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in index d0c5a608e45..ed1926f20f0 100644 --- a/cpp/cmake/cudf-build-config.cmake.in +++ b/cpp/cmake/cudf-build-config.cmake.in @@ -2,6 +2,22 @@ cmake_minimum_required(VERSION 3.18) +set(_possible_targets_to_promote + cudf::cudf + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main + cudf::cudftestutil + rmm::rmm + arrow_shared + arrow_cuda_shared ) +foreach(target IN LISTS _possible_targets_to_promote) + if(NOT TARGET ${target}) + list(APPEND _targets_to_promote ${target}) + endif() +endforeach() + 
set(CUDF_VERSION @CUDF_VERSION@) set(CUDF_VERSION_MAJOR @CUDF_VERSION_MAJOR@) set(CUDF_VERSION_MINOR @CUDF_VERSION_MINOR@) @@ -36,21 +52,29 @@ include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetThrust.cmake) # find rmm set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetRMM.cmake) -# find gtest -include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetGTest.cmake) # find arrow -if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") +else() + if(NOT DEFINED CUDF_USE_ARROW_STATIC) + set(CUDF_USE_ARROW_STATIC OFF) + endif() include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetArrow.cmake) endif() +# find GTest +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-gtesting-targets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/cudf-gtesting-targets.cmake") +else() + # find gtest + include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetGTest.cmake) +endif() + list(POP_FRONT CMAKE_MODULE_PATH) -if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") -endif() -include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake") if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() @@ -59,6 +83,12 @@ include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake") check_required_components(cudf) +foreach(target IN LISTS _targets_to_promote) + if(TARGET ${target}) + fix_cmake_global_defaults(${target}) + endif() +endforeach() + set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}") include(FindPackageHandleStandardArgs) diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in index 14f8a661c2f..66c669851fa 100644 --- a/cpp/cmake/cudf-config.cmake.in +++ b/cpp/cmake/cudf-config.cmake.in @@ -23,21 +23,16 @@ targets: cudf::cudf - The main cudf library. 
This module offers an optional testing component which defines the -following IMPORTED GLOBAL targets: +following IMPORTED GLOBAL targets: cudf::cudftestutil - The main cudf testing library - cudf::gmock - cudf::gmock_main - cudf::gtest - cudf::gtest_main - Result Variables ^^^^^^^^^^^^^^^^ This module will set the following variables in your project:: - CUDF_FOUND + cudf_FOUND CUDF_VERSION CUDF_VERSION_MAJOR CUDF_VERSION_MINOR @@ -49,13 +44,11 @@ cmake_minimum_required(VERSION 3.18) set(_possible_targets_to_promote cudf::cudf - cudf::benchmark - cudf::benchmark_main - cudf::gmock - cudf::gtest - cudf::gmock_main - cudf::gtest_main cudf::cudftestutil + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main rmm::rmm arrow_shared arrow_cuda_shared ) @@ -101,17 +94,22 @@ include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake") if(testing IN_LIST cudf_FIND_COMPONENTS) enable_language(CUDA) - find_dependency(GTest @CUDF_MIN_VERSION_GTest@) + find_dependency(GTest @CUDF_MIN_VERSION_GTest@ CONFIG) + include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") + endif() include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake") check_required_components(cudf) -foreach(t IN LISTS _targets_to_promote) - if(TARGET ${t}) - set_target_properties(${t} PROPERTIES IMPORTED_GLOBAL TRUE) +foreach(target IN LISTS _targets_to_promote) + if(TARGET ${target}) + get_target_property(_already_global ${target} IMPORTED_GLOBAL) + if(NOT _already_global) + set_target_properties(${target} PROPERTIES IMPORTED_GLOBAL TRUE) + endif() endif() endforeach() set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}") diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake index 666ba0fbb2c..9e4f3c137b1 100644 --- a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake @@ -26,7 +26,7 @@ function(find_and_configure_gtest VERSION) GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG release-${VERSION} GIT_SHALLOW TRUE - OPTIONS "INSTALL_GTEST OFF" + OPTIONS "INSTALL_GTEST ON" # googletest >= 1.10.0 provides a cmake config file -- use it if it exists FIND_PACKAGE_ARGUMENTS "CONFIG") # Add GTest aliases if they don't already exist. 
@@ -43,14 +43,6 @@ function(find_and_configure_gtest VERSION) fix_cmake_global_defaults(GTest::gmock) fix_cmake_global_defaults(GTest::gtest_main) fix_cmake_global_defaults(GTest::gmock_main) - if(GTest_ADDED) - install(TARGETS gmock - gtest - gmock_main - gtest_main - DESTINATION lib - EXPORT cudf-testing-targets) - endif() endfunction() set(CUDF_MIN_VERSION_GTest 1.10.0) diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake index e5d1f2f07a9..136947674f9 100644 --- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake @@ -55,11 +55,6 @@ function(find_and_configure_rmm VERSION) # Make sure consumers of cudf can also see rmm::rmm fix_cmake_global_defaults(rmm::rmm) - - if(NOT rmm_BINARY_DIR IN_LIST CMAKE_PREFIX_PATH) - list(APPEND CMAKE_PREFIX_PATH "${rmm_BINARY_DIR}") - set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} PARENT_SCOPE) - endif() endfunction() set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 5a02f5bbe55..14d44b77fad 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -472,6 +472,13 @@ class alignas(16) column_device_view : public detail::column_device_view_base { return d_children[child_index]; } + /** + * @brief Returns the number of child columns + * + * @return The number of child columns + */ + __host__ __device__ size_type num_child_columns() const noexcept { return _num_children; } + protected: column_device_view* d_children{}; ///< Array of `column_device_view` ///< objects in device memory. diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 31196824845..43c2407d629 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -21,7 +21,6 @@ #include #include -#include namespace cudf { /** diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index 8333cf41b77..182cbbdc3ec 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,9 @@
 #include
 #include
+#include
 #include
-#include

 namespace cudf {
 /**
@@ -36,13 +36,13 @@ namespace cudf {
 *
 * Returns empty `device_buffer` if the column is not nullable
 *
- * @param views Vector of column views whose bitmask will to be concatenated
+ * @param views host_span of column views whose bitmasks will be concatenated
 * @param mr Device memory resource used for allocating the new device_buffer
 * @return rmm::device_buffer A `device_buffer` containing the bitmasks of all
 * the column views in the views vector
 */
 rmm::device_buffer concatenate_masks(
-  std::vector<column_view> const& views,
+  host_span<column_view const> views,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -51,14 +51,13 @@ rmm::device_buffer concatenate_masks(
 * @throws cudf::logic_error
 * If types of the input columns mismatch
 *
- * @param columns_to_concat The column views to be concatenated into a single
- * column
+ * @param columns_to_concat host_span of column views to be concatenated into a single column
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return Unique pointer to a single table having all the rows from the
 * elements of `columns_to_concat` respectively in the same order.
 */
 std::unique_ptr<column> concatenate(
-  std::vector<column_view> const& columns_to_concat,
+  host_span<column_view const> columns_to_concat,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -82,14 +81,13 @@ std::unique_ptr<column> concatenate(
 * @throws cudf::logic_error
 * If number of columns mismatch
 *
- * @param tables_to_concat The table views to be concatenated into a single
- * table
+ * @param tables_to_concat host_span of table views to be concatenated into a single table
 * @param mr Device memory resource used to allocate the returned table's device memory.
 * @return Unique pointer to a single table having all the rows from the
 * elements of `tables_to_concat` respectively in the same order.
 */
 std::unique_ptr<table> concatenate(
-  std::vector<table_view> const& tables_to_concat,
+  host_span<table_view const> tables_to_concat,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /** @} */  // end of group
diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate.cuh
index a30ad6e853d..5f0399d6172 100644
--- a/cpp/include/cudf/detail/concatenate.cuh
+++ b/cpp/include/cudf/detail/concatenate.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include

 #include

@@ -34,8 +35,8 @@ namespace detail {
 *
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
-void concatenate_masks(rmm::device_vector<column_device_view> const& d_views,
-                       rmm::device_vector<size_type> const& d_offsets,
+void concatenate_masks(device_span<column_device_view const> d_views,
+                       device_span<size_type const> d_offsets,
                        bitmask_type* dest_mask,
                        size_type output_size,
                        rmm::cuda_stream_view stream);

@@ -45,7 +46,7 @@ void concatenate_masks(rmm::device_vector<column_device_view> const& d_views,
 *
 * @param stream CUDA stream used for device memory operations and kernel launches.
*/ -void concatenate_masks(std::vector const& views, +void concatenate_masks(host_span views, bitmask_type* dest_mask, rmm::cuda_stream_view stream); diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 43eb5203b37..f7f5567cd76 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -27,22 +28,22 @@ namespace cudf { //! Inner interfaces and implementations namespace detail { /** - * @copydoc cudf::concatenate(std::vector const&,rmm::mr::device_memory_resource*) + * @copydoc cudf::concatenate(host_span,rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr concatenate( - std::vector const& columns_to_concat, + host_span columns_to_concat, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::concatenate(std::vector const&,rmm::mr::device_memory_resource*) + * @copydoc cudf::concatenate(host_span,rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
concatenate( - std::vector const& tables_to_concat, + host_span tables_to_concat, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index ce5fdb92bd1..36a76c7b6de 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include @@ -36,12 +37,12 @@ namespace hash { * @return true A hash-based groupby can be used * @return false A hash-based groupby cannot be used */ -bool can_use_hash_groupby(table_view const& keys, std::vector const& requests); +bool can_use_hash_groupby(table_view const& keys, host_span requests); // Hash-based groupby std::pair, std::vector> groupby( table_view const& keys, - std::vector const& requests, + host_span requests, null_policy include_null_keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index a68d649b8c8..bfc9673d3cb 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -93,7 +93,7 @@ struct sort_groupby_helper { */ std::unique_ptr sorted_values( column_view const& values, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -108,7 +108,7 @@ struct sort_groupby_helper { */ std::unique_ptr grouped_values( column_view const& values, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -117,7 +117,7 @@ struct sort_groupby_helper { * @return a new table in which each row is a unique row in the sorted key table. */ std::unique_ptr
unique_keys( - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -126,13 +126,13 @@ struct sort_groupby_helper { * @return a new table containing the sorted keys. */ std::unique_ptr
sorted_keys( - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get the number of groups in `keys` */ - size_type num_groups() { return group_offsets().size() - 1; } + size_type num_groups(rmm::cuda_stream_view stream) { return group_offsets(stream).size() - 1; } /** * @brief Return the effective number of keys @@ -141,7 +141,7 @@ struct sort_groupby_helper { * When include_null_keys = NO, returned value is the number of rows in `keys` * in which no element is null */ - size_type num_keys(rmm::cuda_stream_view stream = rmm::cuda_stream_default); + size_type num_keys(rmm::cuda_stream_view stream); /** * @brief Get the sorted order of `keys`. @@ -156,7 +156,7 @@ struct sort_groupby_helper { * * @return the sort order indices for `keys`. */ - column_view key_sort_order(rmm::cuda_stream_view stream = rmm::cuda_stream_default); + column_view key_sort_order(rmm::cuda_stream_view stream); /** * @brief Get each group's offset into the sorted order of `keys`. @@ -169,13 +169,13 @@ struct sort_groupby_helper { * @return vector of offsets of the starting point of each group in the sorted * key table */ - index_vector const& group_offsets(rmm::cuda_stream_view stream = rmm::cuda_stream_default); + index_vector const& group_offsets(rmm::cuda_stream_view stream); /** * @brief Get the group labels corresponding to the sorted order of `keys`. * * Each group is assigned a unique numerical "label" in - * `[0, 1, 2, ... , num_groups() - 1, num_groups())`. + * `[0, 1, 2, ... , num_groups() - 1, num_groups(stream))`. * For a row in sorted `keys`, its corresponding group label indicates which * group it belongs to. * @@ -184,7 +184,7 @@ struct sort_groupby_helper { * * @return vector of group labels for each row in the sorted key column */ - index_vector const& group_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default); + index_vector const& group_labels(rmm::cuda_stream_view stream); private: /** @@ -192,7 +192,7 @@ struct sort_groupby_helper { * * Returns the group label for every row in the original `keys` table. For a * given unique key row, its group label is equivalent to what is returned by - * `group_labels()`. However, if a row contains a null value, and + * `group_labels(stream)`. However, if a row contains a null value, and * `include_null_keys == NO`, then its label is NULL. * * Computes and stores unsorted labels on first invocation and returns stored @@ -201,7 +201,7 @@ struct sort_groupby_helper { * @return A nullable column of `INT32` containing group labels in the order * of the unsorted key table */ - column_view unsorted_keys_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default); + column_view unsorted_keys_labels(rmm::cuda_stream_view stream); /** * @brief Get the column representing the row bitmask for the `keys` @@ -215,7 +215,7 @@ struct sort_groupby_helper { * Computes and stores bitmask on first invocation and returns stored column * on subsequent calls. 
*/ - column_view keys_bitmask_column(rmm::cuda_stream_view stream = rmm::cuda_stream_default); + column_view keys_bitmask_column(rmm::cuda_stream_view stream); private: column_ptr _key_sorted_order; ///< Indices to produce _keys in sorted order diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index b0870ef8d9a..77cb321a12c 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -53,7 +53,7 @@ void set_null_mask(bitmask_type *bitmask, * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::vector segmented_count_set_bits(bitmask_type const *bitmask, - std::vector const &indices, + host_span indices, rmm::cuda_stream_view stream); /** @@ -62,7 +62,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, - std::vector const &indices, + host_span indices, rmm::cuda_stream_view stream); /** diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index ae2e0f0ba38..c2fe2dce1fe 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include @@ -36,7 +37,7 @@ namespace detail { * @return New column with concatenated results. */ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index eb752a8a0ea..952075b1703 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -218,14 +218,15 @@ class fixed_point { using rep = Rep; /** - * @brief Constructor that will perform shifting to store value appropriately + * @brief Constructor that will perform shifting to store value appropriately (from floating point + * types) * - * @tparam T The type that you are constructing from (integral or floating) + * @tparam T The floating point type that you are constructing from * @param value The value that will be constructed from * @param scale The exponent that is applied to Rad to perform shifting */ template () && + typename cuda::std::enable_if_t() && is_supported_representation_type()>* = nullptr> CUDA_HOST_DEVICE_CALLABLE explicit fixed_point(T const& value, scale_type const& scale) : _value{static_cast(detail::shift(value, scale))}, _scale{scale} @@ -233,8 +234,25 @@ class fixed_point { } /** - * @brief Constructor that will not perform shifting (assumes value already - * shifted) + * @brief Constructor that will perform shifting to store value appropriately (from integral + * types) + * + * @tparam T The integral type that you are constructing from + * @param value The value that will be constructed from + * @param scale The exponent that is applied to Rad to perform shifting + */ + template () && + is_supported_representation_type()>* = nullptr> + CUDA_HOST_DEVICE_CALLABLE explicit 
fixed_point(T const& value, scale_type const& scale) + // `value` is cast to `Rep` to avoid overflow in cases where + // constructing to `Rep` that is wider than `T` + : _value{detail::shift(static_cast(value), scale)}, _scale{scale} + { + } + + /** + * @brief Constructor that will not perform shifting (assumes value already shifted) * * @param s scaled_integer that contains scale and already shifted value */ @@ -260,18 +278,33 @@ class fixed_point { fixed_point() : _value{0}, _scale{scale_type{0}} {} /** - * @brief Explicit conversion operator + * @brief Explicit conversion operator for casting to floating point types * - * @tparam U The type that is being explicitly converted to (integral or floating) + * @tparam U The floating point type that is being explicitly converted to * @return The `fixed_point` number in base 10 (aka human readable format) */ template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE explicit constexpr operator U() const + typename cuda::std::enable_if_t::value>* = nullptr> + explicit constexpr operator U() const { return detail::shift(static_cast(_value), detail::negate(_scale)); } + /** + * @brief Explicit conversion operator for casting to integral types + * + * @tparam U The integral type that is being explicitly converted to + * @return The `fixed_point` number in base 10 (aka human readable format) + */ + template ::value>* = nullptr> + explicit constexpr operator U() const + { + // Don't cast to U until converting to Rep because in certain cases casting to U before shifting + // will result in integer overflow (i.e. if U = int32_t, Rep = int64_t and _value > 2 billion) + return static_cast(detail::shift(_value, detail::negate(_scale))); + } + CUDA_HOST_DEVICE_CALLABLE operator scaled_integer() const { return scaled_integer{_value, _scale}; diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 1dfacd53e0d..19f87873873 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -163,7 +164,7 @@ class groupby { * specified in `requests`. */ std::pair, std::vector> aggregate( - std::vector const& requests, + host_span requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,7 +219,7 @@ class groupby { * specified in `requests`. */ std::pair, std::vector> scan( - std::vector const& requests, + host_span requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -277,18 +278,18 @@ class groupby { * aggregation requests. */ std::pair, std::vector> dispatch_aggregation( - std::vector const& requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); // Sort-based groupby std::pair, std::vector> sort_aggregate( - std::vector const& requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); std::pair, std::vector> sort_scan( - std::vector const& requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); }; diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index f9adc893b8e..30797443c35 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -42,7 +43,7 @@ namespace detail { * @return New column with concatenated results. */ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 0d4de1a9beb..ae6c0cfdbd7 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include @@ -136,38 +137,32 @@ cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, s * `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2). * * Returns an empty vector if `bitmask == nullptr`. + * * @throws cudf::logic_error if `indices.size() % 2 != 0` - * @throws cudf::logic_error if `indices[2*i] < 0 or - * indices[2*i] > indices[(2*i)+1]` - * - * @param[in] bitmask Bitmask residing in device memory whose bits will be - * counted - * @param[in] indices A vector of indices used to specify ranges to count the - * number of set bits - * @return std::vector A vector storing the number of non-zero bits - * in the specified ranges + * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]` + * + * @param[in] bitmask Bitmask residing in device memory whose bits will be counted + * @param[in] indices A host_span of indices specifying ranges to count the number of set bits + * @return A vector storing the number of non-zero bits in the specified ranges */ std::vector segmented_count_set_bits(bitmask_type const* bitmask, - std::vector const& indices); + host_span indices); /** * @brief Given a bitmask, counts the number of unset (0) bits in every range * `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2). * * Returns an empty vector if `bitmask == nullptr`. + * * @throws cudf::logic_error if `indices.size() % 2 != 0` - * @throws cudf::logic_error if `indices[2*i] < 0 or - * indices[2*i] > indices[(2*i)+1]` - * - * @param[in] bitmask Bitmask residing in device memory whose bits will be - * counted - * @param[in] indices A vector of indices used to specify ranges to count the - * number of unset bits - * @return std::vector A vector storing the number of zero bits in - * the specified ranges + * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]` + * + * @param[in] bitmask Bitmask residing in device memory whose bits will be counted + * @param[in] indices A host_span of indices specifying ranges to count the number of unset bits + * @return A vector storing the number of zero bits in the specified ranges */ std::vector segmented_count_unset_bits(bitmask_type const* bitmask, - std::vector const& indices); + host_span indices); /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index 3e6fc6d67fc..0740039e896 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include

 #include

@@ -41,7 +42,7 @@ namespace detail {
 * @return New column with concatenated results.
 */
 std::unique_ptr<column> concatenate(
-  std::vector<column_view> const& columns,
+  host_span<column_view const> columns,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp
index ef3da82cfeb..a098703e4b0 100644
--- a/cpp/include/cudf/structs/detail/concatenate.hpp
+++ b/cpp/include/cudf/structs/detail/concatenate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include

 namespace cudf {
 namespace structs {
@@ -48,7 +49,7 @@ namespace detail {
 * @return New column with concatenated results.
 */
 std::unique_ptr<column> concatenate(
-  std::vector<column_view> const& columns,
+  host_span<column_view const> columns,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 04d215ff7cb..5af3c29a3d9 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -91,6 +91,26 @@ __device__ weak_ordering relational_compare(Element lhs, Element rhs)
   return detail::compare_elements(lhs, rhs);
 }

+/**
+ * @brief Compare the nulls according to null order.
+ *
+ * @param lhs_is_null boolean representing if lhs is null
+ * @param rhs_is_null boolean representing if rhs is null
+ * @param null_precedence null order
+ * @return Indicates the relationship between null in lhs and rhs columns.
+ */
+inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_order null_precedence)
+{
+  if (lhs_is_null and rhs_is_null) { // null (table_device_view const& lhs, + table_device_view const& rhs); +extern template bool is_relationally_comparable( + mutable_table_device_view const& lhs, mutable_table_device_view const& rhs); +} // namespace detail } // namespace cudf
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index 083366cc310..5cdecab9115 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -291,4 +291,21 @@ table_view scatter_columns(table_view const& source,
   std::vector<size_type> const& map,
   table_view const& target);

+namespace detail {
+/**
+ * @brief Indicates whether respective columns in input tables are relationally comparable.
+ * + * @param lhs The first table + * @param rhs The second table (may be the same table as `lhs`) + * @return true all of respective columns on `lhs` and 'rhs` tables are comparable. + * @return false any of respective columns on `lhs` and 'rhs` tables are not comparable. + */ +template +bool is_relationally_comparable(TableView const& lhs, TableView const& rhs); + +extern template bool is_relationally_comparable(table_view const& lhs, + table_view const& rhs); +extern template bool is_relationally_comparable(mutable_table_view const& lhs, + mutable_table_view const& rhs); +} // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 7a3316a0571..727284194d8 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -260,12 +260,12 @@ class data_type { /** * @brief Returns the type identifier */ - CUDA_HOST_DEVICE_CALLABLE type_id id() const noexcept { return _id; } + constexpr type_id id() const noexcept { return _id; } /** * @brief Returns the scale (for fixed_point types) */ - CUDA_HOST_DEVICE_CALLABLE int32_t scale() const noexcept { return _fixed_point_scale; } + constexpr int32_t scale() const noexcept { return _fixed_point_scale; } private: type_id _id{type_id::EMPTY}; @@ -287,7 +287,7 @@ class data_type { * @return true `lhs` is equal to `rhs` * @return false `lhs` is not equal to `rhs` */ -inline bool operator==(data_type const& lhs, data_type const& rhs) +constexpr bool operator==(data_type const& lhs, data_type const& rhs) { // use std::tie in the future, breaks JITIFY currently return lhs.id() == rhs.id() && lhs.scale() == rhs.scale(); diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index c13e5ce44ae..999306d4ee7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -126,16 +126,31 @@ struct host_span : public cudf::detail::span_base::value>* = nullptr> + // Constructor from container + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr host_span(C& in) : base(in.data(), in.size()) { } - template ::value>* = nullptr> + // Constructor from const container + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr host_span(C const& in) : base(in.data(), in.size()) { } + // Copy construction to support const conversion template ::value>* = nullptr> + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr device_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } - template ::value>* = nullptr> + template < + typename C, + // Only supported containers of types convertible to T + std::enable_if_t::value && + std::is_convertible().data()))> (*)[], + T (*)[]>::value>* = nullptr> constexpr device_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index a307bf9d3f0..e178f5a6280 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -13,112 +13,51 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 #=============================================================================
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
-project(CUDA_KAFKA VERSION 0.15.0 LANGUAGES C CXX CUDA)
-
-# TODO: Since we have no actual CUDA code in cudf_kafka this should be removed in the future
-# in favor of using FindCUDAToolkit to get the needed CUDA include headers
-if(NOT CMAKE_CUDA_COMPILER)
-  message(SEND_ERROR "CMake cannot locate a CUDA compiler")
-endif(NOT CMAKE_CUDA_COMPILER)
-
-###################################################################################################
-# - build type ------------------------------------------------------------------------------------
-
-# Set a default build type if none was specified
-set(DEFAULT_BUILD_TYPE "Release")
-
-if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-  message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' since none specified.")
-  set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE
-      STRING "Choose the type of build." FORCE)
-  # Set the possible values of build type for cmake-gui
-  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
-    "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+project(CUDA_KAFKA VERSION 0.19.0 LANGUAGES CXX)
 ###################################################################################################
-# - compiler options ------------------------------------------------------------------------------
-
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
+# - Build options
+option(BUILD_TESTS "Build tests for libcudf_kafka" ON)
-# To apply RUNPATH to transitive dependencies (this is a temporary solution)
-set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--disable-new-dtags")
-set(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags")
-
-# Build options
-option(BUILD_TESTS "Configure CMake to build tests" ON)
+message(VERBOSE "CUDF_KAFKA: Build gtests: ${BUILD_TESTS}")
 ###################################################################################################
-# - cmake modules ---------------------------------------------------------------------------------
-
-message(VERBOSE "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
-set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" ${CMAKE_MODULE_PATH})
+# - Dependencies
-include(FeatureSummary)
-include(CheckIncludeFiles)
-include(CheckLibraryExists)
+# CPM
+include(../cmake/thirdparty/CUDF_GetCPM.cmake)
-###################################################################################################
-# - conda environment -----------------------------------------------------------------------------
+# libcudf
+include(cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake)
-if("$ENV{CONDA_BUILD}" STREQUAL "1")
-    set(CMAKE_SYSTEM_PREFIX_PATH "$ENV{BUILD_PREFIX};$ENV{PREFIX};${CMAKE_SYSTEM_PREFIX_PATH}")
-    set(CONDA_INCLUDE_DIRS "$ENV{BUILD_PREFIX}/include" "$ENV{PREFIX}/include")
-    set(CONDA_LINK_DIRS "$ENV{BUILD_PREFIX}/lib" "$ENV{PREFIX}/lib")
-    message(VERBOSE "Conda build detected, CMAKE_SYSTEM_PREFIX_PATH set to: ${CMAKE_SYSTEM_PREFIX_PATH}")
-endif()
+# librdkafka
+include(cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake)
-###################################################################################################
-# - add gtest -------------------------------------------------------------------------------------
+# # GTests if enabled
+if (BUILD_TESTS)
+  # GoogleTest
+
include(../cmake/thirdparty/CUDF_GetGTest.cmake) -# TODO: This is currently using a nearly duplicate Google Test Module due to CMake source limitations. -# this should be standardized in the future to use the same Google Test Module as cudf -if(BUILD_TESTS) + # include CTest module -- automatically calls enable_testing() include(CTest) - include(ConfigureGoogleTest) - - if(GTEST_FOUND) - message(VERBOSE "Google C++ Testing Framework (Google Test) found in ${GTEST_ROOT}") - include_directories(${GTEST_INCLUDE_DIR}) - add_subdirectory(${CMAKE_SOURCE_DIR}/tests) - else() - message(AUTHOR_WARNING "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.") - endif(GTEST_FOUND) -endif(BUILD_TESTS) - -message(VERBOSE "CUDF_KAFKA_TEST_LIST set to: ${CUDF_KAFKA_TEST_LIST}") + add_subdirectory(tests) +endif() ################################################################################################### # - include paths --------------------------------------------------------------------------------- -if(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES) - include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") -endif(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES) - include_directories("${CMAKE_BINARY_DIR}/include" - "${CMAKE_BINARY_DIR}/include/jit" "${CMAKE_SOURCE_DIR}/include" "${CMAKE_SOURCE_DIR}/src") -if(CONDA_INCLUDE_DIRS) - include_directories("${CONDA_INCLUDE_DIRS}") -endif(CONDA_INCLUDE_DIRS) - ################################################################################################### # - library paths --------------------------------------------------------------------------------- link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc "${CMAKE_BINARY_DIR}/lib" - "${CMAKE_BINARY_DIR}" - "${GTEST_LIBRARY_DIR}" - "${RMM_LIBRARY}") - -if(CONDA_LINK_DIRS) - link_directories("${CONDA_LINK_DIRS}") -endif(CONDA_LINK_DIRS) + "${CMAKE_BINARY_DIR}") ################################################################################################### # - library target -------------------------------------------------------------------------------- @@ -127,37 +66,12 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp ) -set_target_properties(cudf_kafka PROPERTIES BUILD_RPATH "\$ORIGIN") - -# Include paths -include_directories("${CMAKE_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/include/cudf") - -################################################################################################### -# cudf_kafka - librdkafka ------------------------------------------------------------------------- - -find_path(RDKAFKA_INCLUDE "librdkafka" HINTS "$ENV{RDKAFKA_ROOT}/include") -find_library(RDKAFKA++_LIBRARY "rdkafka++" HINTS "$ENV{RDKAFKA_ROOT}/lib" "$ENV{RDKAFKA_ROOT}/build") - -message(VERBOSE "RDKAFKA: RDKAFKA++_LIBRARY set to ${RDKAFKA++_LIBRARY}") -message(VERBOSE "RDKAFKA: RDKAFKA_INCLUDE set to ${RDKAFKA_INCLUDE}") - -target_link_libraries(cudf_kafka ${RDKAFKA++_LIBRARY}) -include_directories("${RDKAFKA_INCLUDE}") - ################################################################################################### # - cudf_kafka Install ---------------------------------------------------------------------------- -target_link_libraries(cudf_kafka cudf) +target_link_libraries(cudf_kafka cudf::cudf RDKAFKA::RDKAFKA) install(TARGETS cudf_kafka DESTINATION lib) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include) - 
-add_custom_target(build_tests_libcudf_kafka - DEPENDS ${CUDF_KAFKA_TEST_LIST}) - -add_custom_target(test_libcudf_kafka - COMMAND ctest - DEPENDS build_tests_libcudf_kafka) diff --git a/cpp/libcudf_kafka/cmake/Modules/ConfigureGoogleTest.cmake b/cpp/libcudf_kafka/cmake/Modules/ConfigureGoogleTest.cmake deleted file mode 100644 index e2b3aff8546..00000000000 --- a/cpp/libcudf_kafka/cmake/Modules/ConfigureGoogleTest.cmake +++ /dev/null @@ -1,46 +0,0 @@ -set(GTEST_ROOT "${CMAKE_BINARY_DIR}/googletest") - -set(GTEST_CMAKE_ARGS "") - -configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/GoogleTest.CMakeLists.txt.cmake" - "${GTEST_ROOT}/CMakeLists.txt") - -file(MAKE_DIRECTORY "${GTEST_ROOT}/build") -file(MAKE_DIRECTORY "${GTEST_ROOT}/install") - -execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} . - RESULT_VARIABLE GTEST_CONFIG - WORKING_DIRECTORY ${GTEST_ROOT}) - -if(GTEST_CONFIG) - message(FATAL_ERROR "Configuring GoogleTest failed: " ${GTEST_CONFIG}) -endif(GTEST_CONFIG) - -set(PARALLEL_BUILD -j) -if($ENV{PARALLEL_LEVEL}) - set(NUM_JOBS $ENV{PARALLEL_LEVEL}) - set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}") -endif($ENV{PARALLEL_LEVEL}) - -if(${NUM_JOBS}) - if(${NUM_JOBS} EQUAL 1) - message(VERBOSE "GTEST BUILD: Enabling Sequential CMake build") - elseif(${NUM_JOBS} GREATER 1) - message(VERBOSE "GTEST BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs") - endif(${NUM_JOBS} EQUAL 1) -else() - message(VERBOSE "GTEST BUILD: Enabling Parallel CMake build with all threads") -endif(${NUM_JOBS}) - -execute_process(COMMAND ${CMAKE_COMMAND} --build .. -- ${PARALLEL_BUILD} - RESULT_VARIABLE GTEST_BUILD - WORKING_DIRECTORY ${GTEST_ROOT}/build) - -if(GTEST_BUILD) - message(FATAL_ERROR "Building GoogleTest failed: " ${GTEST_BUILD}) -endif(GTEST_BUILD) - -message(VERBOSE "GoogleTest installed here: " ${GTEST_ROOT}/install) -set(GTEST_INCLUDE_DIR "${GTEST_ROOT}/install/include") -set(GTEST_LIBRARY_DIR "${GTEST_ROOT}/install/lib") -set(GTEST_FOUND TRUE) diff --git a/cpp/libcudf_kafka/cmake/Templates/GoogleTest.CMakeLists.txt.cmake b/cpp/libcudf_kafka/cmake/Templates/GoogleTest.CMakeLists.txt.cmake deleted file mode 100644 index 07692cd3d32..00000000000 --- a/cpp/libcudf_kafka/cmake/Templates/GoogleTest.CMakeLists.txt.cmake +++ /dev/null @@ -1,12 +0,0 @@ -cmake_minimum_required(VERSION 3.12) - -include(ExternalProject) - -ExternalProject_Add(GoogleTest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.8.0 - GIT_SHALLOW true - SOURCE_DIR "${GTEST_ROOT}/googletest" - BINARY_DIR "${GTEST_ROOT}/build" - INSTALL_DIR "${GTEST_ROOT}/install" - CMAKE_ARGS ${GTEST_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${GTEST_ROOT}/install) diff --git a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake new file mode 100644 index 00000000000..1f7c15d4f75 --- /dev/null +++ b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake @@ -0,0 +1,46 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(cudfkafka_save_if_enabled var) + if(CUDF_KAFKA_${var}) + unset(${var} PARENT_SCOPE) + unset(${var} CACHE) + endif() +endfunction() + +function(cudfkafka_restore_if_enabled var) + if(CUDF_KAFKA_${var}) + set(${var} ON CACHE INTERNAL "" FORCE) + endif() +endfunction() + +function(find_and_configure_cudf VERSION) + cudfkafka_save_if_enabled(BUILD_TESTS) + cudfkafka_save_if_enabled(BUILD_BENCHMARKS) + CPMFindPackage(NAME cudf + VERSION ${VERSION} + GIT_REPOSITORY https://github.com/rapidsai/cudf.git + GIT_TAG branch-${VERSION} + GIT_SHALLOW TRUE + SOURCE_SUBDIR cpp + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF") + cudfkafka_restore_if_enabled(BUILD_TESTS) + cudfkafka_restore_if_enabled(BUILD_BENCHMARKS) +endfunction() + +set(CUDF_KAFKA_MIN_VERSION_cudf 0.19) +find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION_cudf}) diff --git a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake new file mode 100644 index 00000000000..5c07db66668 --- /dev/null +++ b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetRDKafka.cmake @@ -0,0 +1,25 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +find_path(RDKAFKA_INCLUDE "librdkafka" HINTS "$ENV{RDKAFKA_ROOT}/include") +find_library(RDKAFKA++_LIBRARY "rdkafka++" HINTS "$ENV{RDKAFKA_ROOT}/lib" "$ENV{RDKAFKA_ROOT}/build") + +if(RDKAFKA_INCLUDE AND RDKAFKA++_LIBRARY) + add_library(rdkafka INTERFACE) + target_link_libraries(rdkafka INTERFACE "${RDKAFKA++_LIBRARY}") + target_include_directories(rdkafka INTERFACE "${RDKAFKA_INCLUDE}") + add_library(RDKAFKA::RDKAFKA ALIAS rdkafka) +endif() \ No newline at end of file diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index af0ea1c8239..e813ed5439e 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,100 +14,25 @@ # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) - -project(KAFKA_TESTS VERSION 0.15.0 LANGUAGES C CXX CUDA) - -# TODO: Since we have no actual CUDA code in cudf_kafka this should be removed in the future -# in favor of using FindCUDAToolkit to get the needed CUDA include headers -if(NOT CMAKE_CUDA_COMPILER) - message(SEND_ERROR "CMake cannot locate a CUDA compiler") -endif(NOT CMAKE_CUDA_COMPILER) - -################################################################################################### -# - build type ------------------------------------------------------------------------------------ - -# Set a default build type if none was specified -set(DEFAULT_BUILD_TYPE "Release") - -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' since none specified.") - set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE - STRING "Choose the type of build." FORCE) - # Set the possible values of build type for cmake-gui - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS - "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - -################################################################################################### -# - compiler options ------------------------------------------------------------------------------ - -set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -# To apply RUNPATH to transitive dependencies (this is a temporary solution) -set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--disable-new-dtags") -set(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags") - -################################################################################################### -# - conda environment ----------------------------------------------------------------------------- - -if("$ENV{CONDA_BUILD}" STREQUAL "1") - set(CMAKE_SYSTEM_PREFIX_PATH "$ENV{BUILD_PREFIX};$ENV{PREFIX};${CMAKE_SYSTEM_PREFIX_PATH}") - set(CONDA_INCLUDE_DIRS "$ENV{BUILD_PREFIX}/include" "$ENV{PREFIX}/include") - set(CONDA_LINK_DIRS "$ENV{BUILD_PREFIX}/lib" "$ENV{PREFIX}/lib") - message(STATUS "Conda build detected, CMAKE_SYSTEM_PREFIX_PATH set to: ${CMAKE_SYSTEM_PREFIX_PATH}") -endif() - ################################################################################################### # - compiler function ----------------------------------------------------------------------------- -set(CUDF_KAFKA_TEST_LIST CACHE INTERNAL "CUDF_KAFKA_TEST_LIST") - -function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC) - add_executable(${CMAKE_TEST_NAME} - ${CMAKE_TEST_SRC}) - set_target_properties(${CMAKE_TEST_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${CMAKE_TEST_NAME} gmock gtest gtest_main pthread cuda cudf_kafka) - set_target_properties(${CMAKE_TEST_NAME} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests") +function(ConfigureTest CMAKE_TEST_NAME ) + add_executable(${CMAKE_TEST_NAME} ${ARGN}) + set_target_properties(${CMAKE_TEST_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka) + target_include_directories(${CMAKE_TEST_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include) add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) - set(CUDF_KAFKA_TEST_LIST ${CUDF_KAFKA_TEST_LIST} ${CMAKE_TEST_NAME} CACHE INTERNAL "CUDF_KAFKA_TEST_LIST") -endfunction(ConfigureTest) +endfunction() 
################################################################################################### -# - include paths --------------------------------------------------------------------------------- - -if(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES) - include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") -endif(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES) - -include_directories("${CMAKE_BINARY_DIR}/include" - "${CMAKE_SOURCE_DIR}/include" - "${CMAKE_SOURCE_DIR}../../../tests" - "${CMAKE_SOURCE_DIR}" - "${CMAKE_SOURCE_DIR}/src" - "${GTEST_INCLUDE_DIR}") - -if(CONDA_INCLUDE_DIRS) - include_directories("${CONDA_INCLUDE_DIRS}") -endif(CONDA_INCLUDE_DIRS) +# - Kafka host tests ---------------------------------------------------------------------------------- +ConfigureTest(KAFKA_HOST_TEST + kafka_consumer_tests.cpp) ################################################################################################### -# - library paths --------------------------------------------------------------------------------- - -link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc - "${CMAKE_BINARY_DIR}/lib" - "${CMAKE_BINARY_DIR}" - "${GTEST_LIBRARY_DIR}" - "${RMM_LIBRARY}") - -if(CONDA_LINK_DIRS) - link_directories("${CONDA_LINK_DIRS}") -endif(CONDA_LINK_DIRS) - +### enable testing ################################################################################ ################################################################################################### -# - create tests ---------------------------------------------------------------------------------- -ConfigureTest(CUDF_KAFKA_HOST_READ kafka_consumer_tests.cpp) enable_testing() diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 845a5512c27..28d1411c30d 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -30,7 +31,6 @@ #include #include #include -#include #include #include @@ -466,7 +466,7 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask, } std::vector segmented_count_set_bits(bitmask_type const *bitmask, - std::vector const &indices, + host_span indices, rmm::cuda_stream_view stream) { CUDF_EXPECTS(indices.size() % 2 == 0, @@ -489,8 +489,8 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, } size_type num_ranges = indices.size() / 2; - thrust::host_vector h_first_indices(num_ranges); - thrust::host_vector h_last_indices(num_ranges); + std::vector h_first_indices(num_ranges); + std::vector h_last_indices(num_ranges); thrust::stable_partition_copy(thrust::seq, std::begin(indices), std::end(indices), @@ -499,9 +499,9 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, h_last_indices.begin(), [](auto i) { return (i % 2) == 0; }); - rmm::device_vector d_first_indices = h_first_indices; - rmm::device_vector d_last_indices = h_last_indices; - rmm::device_vector d_null_counts(num_ranges, 0); + auto d_first_indices = make_device_uvector_async(h_first_indices, stream); + auto d_last_indices = make_device_uvector_async(h_last_indices, stream); + rmm::device_uvector d_null_counts(num_ranges, stream); auto word_num_set_bits = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -510,12 +510,12 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, thrust::make_counting_iterator(0), // We cannot use lambda as 
cub::DeviceSegmentedReduce::Sum() requires // first_word_indices and last_word_indices to have the same type. - to_word_index(true, d_first_indices.data().get())); + to_word_index(true, d_first_indices.data())); auto last_word_indices = thrust::make_transform_iterator( thrust::make_counting_iterator(0), // We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires // first_word_indices and last_word_indices to have the same type. - to_word_index(false, d_last_indices.data().get())); + to_word_index(false, d_last_indices.data())); // first allocate temporary memroy @@ -560,7 +560,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, std::vector ret(num_ranges); CUDA_TRY(cudaMemcpyAsync(ret.data(), - d_null_counts.data().get(), + d_null_counts.data(), num_ranges * sizeof(size_type), cudaMemcpyDeviceToHost, stream.value())); @@ -571,7 +571,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, } std::vector segmented_count_unset_bits(bitmask_type const *bitmask, - std::vector const &indices, + host_span indices, rmm::cuda_stream_view stream) { if (indices.empty()) { @@ -669,7 +669,7 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask, size_type start, s // Count non-zero bits in the specified ranges std::vector segmented_count_set_bits(bitmask_type const *bitmask, - std::vector const &indices) + host_span indices) { CUDF_FUNC_RANGE(); return detail::segmented_count_set_bits(bitmask, indices, rmm::cuda_stream_default); @@ -677,7 +677,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, // Count zero bits in the specified ranges std::vector segmented_count_unset_bits(bitmask_type const *bitmask, - std::vector const &indices) + host_span indices) { CUDF_FUNC_RANGE(); return detail::segmented_count_unset_bits(bitmask, indices, rmm::cuda_stream_default); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 8cf9db465f3..1b948083982 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -29,7 +30,6 @@ #include #include -#include #include #include @@ -50,19 +50,18 @@ constexpr bool use_fused_kernel_heuristic(bool const has_nulls, size_t const num return has_nulls || num_columns > 4; } -auto create_device_views(std::vector const& views, rmm::cuda_stream_view stream) +auto create_device_views(host_span views, rmm::cuda_stream_view stream) { // Create device views for each input view using CDViewPtr = decltype( column_device_view::create(std::declval(), std::declval())); auto device_view_owners = std::vector(views.size()); - std::transform( - views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) { - // TODO creating this device view can invoke null count computation - // even though it isn't used. See this issue: - // https://github.com/rapidsai/cudf/issues/4368 - return column_device_view::create(col, stream); - }); + std::transform(views.begin(), views.end(), device_view_owners.begin(), [stream](auto const& col) { + // TODO creating this device view can invoke null count computation + // even though it isn't used. 
See this issue: + // https://github.com/rapidsai/cudf/issues/4368 + return column_device_view::create(col, stream); + }); // Assemble contiguous array of device views auto device_views = thrust::host_vector(); @@ -74,7 +73,7 @@ auto create_device_views(std::vector const& views, rmm::cuda_stream // TODO each of these device vector copies invoke stream synchronization // which appears to add unnecessary overhead. See this issue: // https://github.com/rapidsai/rmm/issues/120 - auto d_views = rmm::device_vector{device_views}; + auto d_views = make_device_uvector_async(device_views); // Compute the partition offsets auto offsets = thrust::host_vector(views.size() + 1); @@ -85,7 +84,7 @@ auto create_device_views(std::vector const& views, rmm::cuda_stream std::next(offsets.begin()), [](auto const& col) { return col.size(); }, thrust::plus{}); - auto const d_offsets = rmm::device_vector{offsets}; + auto d_offsets = make_device_uvector_async(offsets); auto const output_size = offsets.back(); return std::make_tuple( @@ -132,8 +131,8 @@ __global__ void concatenate_masks_kernel(column_device_view const* views, } } -void concatenate_masks(rmm::device_vector const& d_views, - rmm::device_vector const& d_offsets, +void concatenate_masks(device_span d_views, + device_span d_offsets, bitmask_type* dest_mask, size_type output_size, rmm::cuda_stream_view stream) @@ -141,14 +140,14 @@ void concatenate_masks(rmm::device_vector const& d_views, constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); concatenate_masks_kernel<<>>( - d_views.data().get(), - d_offsets.data().get(), + d_views.data(), + d_offsets.data(), static_cast(d_views.size()), dest_mask, output_size); } -void concatenate_masks(std::vector const& views, +void concatenate_masks(host_span views, bitmask_type* dest_mask, rmm::cuda_stream_view stream) { @@ -214,7 +213,7 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, } template -std::unique_ptr fused_concatenate(std::vector const& views, +std::unique_ptr fused_concatenate(host_span views, bool const has_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -245,8 +244,8 @@ std::unique_ptr fused_concatenate(std::vector const& views, auto const kernel = has_nulls ? 
fused_concatenate_kernel : fused_concatenate_kernel; kernel<<>>( - d_views.data().get(), - d_offsets.data().get(), + d_views.data(), + d_offsets.data(), static_cast(d_views.size()), *d_out_view, d_valid_count.data()); @@ -257,7 +256,7 @@ std::unique_ptr fused_concatenate(std::vector const& views, } template -std::unique_ptr for_each_concatenate(std::vector const& views, +std::unique_ptr for_each_concatenate(host_span views, bool const has_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -289,7 +288,7 @@ std::unique_ptr for_each_concatenate(std::vector const& vie } struct concatenate_dispatch { - std::vector const& views; + host_span views; rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; @@ -298,7 +297,7 @@ struct concatenate_dispatch { std::unique_ptr operator()() { bool const has_nulls = - std::any_of(views.cbegin(), views.cend(), [](auto const& col) { return col.has_nulls(); }); + std::any_of(views.begin(), views.end(), [](auto const& col) { return col.has_nulls(); }); // Use a heuristic to guess when the fused kernel will be faster if (use_fused_kernel_heuristic(has_nulls, views.size())) { @@ -392,7 +391,7 @@ void bounds_and_type_check(ColIter begin, ColIter end) } // anonymous namespace // Concatenates the elements from a vector of column_views -std::unique_ptr concatenate(std::vector const& columns_to_concat, +std::unique_ptr concatenate(host_span columns_to_concat, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -411,15 +410,15 @@ std::unique_ptr concatenate(std::vector const& columns_to_c columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr}); } -std::unique_ptr
concatenate(std::vector<table_view> const& tables_to_concat, +std::unique_ptr<table>
concatenate(host_span tables_to_concat, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (tables_to_concat.empty()) { return std::make_unique<table>
(); } table_view const first_table = tables_to_concat.front(); - CUDF_EXPECTS(std::all_of(tables_to_concat.cbegin(), - tables_to_concat.cend(), + CUDF_EXPECTS(std::all_of(tables_to_concat.begin(), + tables_to_concat.end(), [&first_table](auto const& t) { return t.num_columns() == first_table.num_columns(); }), @@ -428,8 +427,8 @@ std::unique_ptr
concatenate(std::vector const& tables_to_conc std::vector> concat_columns; for (size_type i = 0; i < first_table.num_columns(); ++i) { std::vector cols; - std::transform(tables_to_concat.cbegin(), - tables_to_concat.cend(), + std::transform(tables_to_concat.begin(), + tables_to_concat.end(), std::back_inserter(cols), [i](auto const& t) { return t.column(i); }); @@ -442,7 +441,7 @@ std::unique_ptr
concatenate(std::vector const& tables_to_conc } // namespace detail -rmm::device_buffer concatenate_masks(std::vector const& views, +rmm::device_buffer concatenate_masks(host_span views, rmm::mr::device_memory_resource* mr) { bool const has_nulls = @@ -465,14 +464,14 @@ rmm::device_buffer concatenate_masks(std::vector const& views, } // Concatenates the elements from a vector of column_views -std::unique_ptr concatenate(std::vector const& columns_to_concat, +std::unique_ptr concatenate(host_span columns_to_concat, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::concatenate(columns_to_concat, rmm::cuda_stream_default, mr); } -std::unique_ptr
concatenate(std::vector<table_view> const& tables_to_concat, +std::unique_ptr<table>
concatenate(host_span tables_to_concat, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 05349a5f968..cdf086e3f4a 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -62,8 +62,7 @@ struct compute_children_offsets_fn { * * @param columns The input dictionary columns. */ - compute_children_offsets_fn(std::vector const& columns) - : columns_ptrs{columns.size()} + compute_children_offsets_fn(host_span columns) : columns_ptrs{columns.size()} { std::transform( columns.begin(), columns.end(), columns_ptrs.begin(), [](auto& cv) { return &cv; }); @@ -187,7 +186,7 @@ struct dispatch_compute_indices { } // namespace -std::unique_ptr concatenate(std::vector const& columns, +std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index 40bc96c6103..e8d5c60f81a 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -18,13 +18,14 @@ #include #include +#include #include namespace cudf { namespace groupby { namespace detail { inline std::vector extract_results( - std::vector const& requests, cudf::detail::result_cache& cache) + host_span requests, cudf::detail::result_cache& cache) { std::vector results(requests.size()); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cdd8ceb0a6c..34c57996af3 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -55,7 +55,7 @@ groupby::groupby(table_view const& keys, // Select hash vs. sort groupby implementation std::pair, std::vector> groupby::dispatch_aggregation( - std::vector const& requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -79,7 +79,7 @@ groupby::~groupby() = default; namespace { /// Make an empty table with appropriate types for requested aggs -auto empty_results(std::vector const& requests) +auto empty_results(host_span requests) { std::vector empty_results; @@ -102,7 +102,7 @@ auto empty_results(std::vector const& requests) } /// Verifies the agg requested on the request's values is valid -void verify_valid_requests(std::vector const& requests) +void verify_valid_requests(host_span requests) { CUDF_EXPECTS( std::all_of( @@ -143,7 +143,7 @@ void verify_valid_requests(std::vector const& requests) // Compute aggregation requests std::pair, std::vector> groupby::aggregate( - std::vector const& requests, rmm::mr::device_memory_resource* mr) + host_span requests, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS( @@ -156,12 +156,12 @@ std::pair, std::vector> groupby::aggr if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); } - return dispatch_aggregation(requests, 0, mr); + return dispatch_aggregation(requests, rmm::cuda_stream_default, mr); } // Compute scan requests std::pair, std::vector> groupby::scan( - std::vector const& requests, rmm::mr::device_memory_resource* mr) + host_span requests, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS( @@ -190,7 +190,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re if (values.num_columns()) { auto grouped_values = cudf::detail::gather(values, - helper().key_sort_order(), + helper().key_sort_order(rmm::cuda_stream_default), cudf::out_of_bounds_policy::DONT_CHECK, 
cudf::detail::negative_index_policy::NOT_ALLOWED, rmm::cuda_stream_default, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c54ecee9ccb..38aacbe59a7 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -110,7 +110,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final data_type result_type; cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; - rmm::device_vector const& gather_map; + device_span gather_map; size_type const map_size; Map const& map; bitmask_type const* __restrict__ row_bitmask; @@ -122,7 +122,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final column_view col, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, - rmm::device_vector const& gather_map, + device_span gather_map, size_type map_size, Map const& map, bitmask_type const* row_bitmask, @@ -272,7 +272,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final // flatten aggs to filter in single pass aggs std::tuple, std::vector> -flatten_single_pass_aggs(std::vector const& requests) +flatten_single_pass_aggs(host_span requests) { std::vector columns; std::vector agg_kinds; @@ -311,10 +311,10 @@ flatten_single_pass_aggs(std::vector const& requests) */ template void sparse_to_dense_results(table_view const& keys, - std::vector const& requests, + host_span requests, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, - rmm::device_vector const& gather_map, + device_span gather_map, size_type map_size, Map const& map, bool keys_have_nulls, @@ -421,7 +421,7 @@ auto create_sparse_results_table(table_view const& flattened_values, */ template void compute_single_pass_aggs(table_view const& keys, - std::vector const& requests, + host_span requests, cudf::detail::result_cache* sparse_results, Map& map, null_policy include_null_keys, @@ -469,10 +469,10 @@ void compute_single_pass_aggs(table_view const& keys, * `map`. */ template -std::pair, size_type> extract_populated_keys( +std::pair, size_type> extract_populated_keys( Map map, size_type num_keys, rmm::cuda_stream_view stream) { - rmm::device_vector populated_keys(num_keys); + rmm::device_uvector populated_keys(num_keys, stream); auto get_key = [] __device__(auto const& element) { size_type key, value; @@ -520,7 +520,7 @@ std::pair, size_type> extract_populated_keys( */ template std::unique_ptr
groupby_null_templated(table_view const& keys, - std::vector<aggregation_request> const& requests, + host_span<aggregation_request const> requests, cudf::detail::result_cache* cache, null_policy include_null_keys, rmm::cuda_stream_view stream, @@ -539,9 +539,9 @@ std::unique_ptr<table>
groupby_null_templated(table_view const& keys, // Extract the populated indices from the hash map and create a gather map. // Gathering using this map from sparse results will give dense results. - rmm::device_vector gather_map; - size_type map_size; - std::tie(gather_map, map_size) = extract_populated_keys(*map, keys.num_rows(), stream); + auto map_and_size = extract_populated_keys(*map, keys.num_rows(), stream); + rmm::device_uvector gather_map{std::move(map_and_size.first)}; + size_type const map_size = map_and_size.second; // Compact all results from sparse_results and insert into cache sparse_to_dense_results(keys, @@ -576,7 +576,7 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, * @return true A hash-based groupby should be used * @return false A hash-based groupby should not be used */ -bool can_use_hash_groupby(table_view const& keys, std::vector const& requests) +bool can_use_hash_groupby(table_view const& keys, host_span requests) { return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { return std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { @@ -588,7 +588,7 @@ bool can_use_hash_groupby(table_view const& keys, std::vector, std::vector> groupby( table_view const& keys, - std::vector const& requests, + host_span requests, null_policy include_null_keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index b171b19413b..4e2303c8b9b 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -70,8 +70,9 @@ void aggregrate_result_functor::operator()(aggregation agg, get_grouped_values().nullable() ? detail::group_count_valid( - get_grouped_values(), helper.group_labels(), helper.num_groups(), stream, mr) - : detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr)); + get_grouped_values(), helper.group_labels(stream), helper.num_groups(stream), stream, mr) + : detail::group_count_all( + helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } template <> @@ -80,7 +81,9 @@ void aggregrate_result_functor::operator()(aggregation c if (cache.has_result(col_idx, agg)) return; cache.add_result( - col_idx, agg, detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr)); + col_idx, + agg, + detail::group_count_all(helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } template <> @@ -88,10 +91,11 @@ void aggregrate_result_functor::operator()(aggregation const& { if (cache.has_result(col_idx, agg)) return; - cache.add_result(col_idx, - agg, - detail::group_sum( - get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); + cache.add_result( + col_idx, + agg, + detail::group_sum( + get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); }; template <> @@ -102,9 +106,9 @@ void aggregrate_result_functor::operator()(aggregation cons cache.add_result(col_idx, agg, detail::group_argmax(get_grouped_values(), - helper.num_groups(), - helper.group_labels(), - helper.key_sort_order(), + helper.num_groups(stream), + helper.group_labels(stream), + helper.key_sort_order(stream), stream, mr)); }; @@ -117,9 +121,9 @@ void aggregrate_result_functor::operator()(aggregation cons cache.add_result(col_idx, agg, detail::group_argmin(get_grouped_values(), - helper.num_groups(), - helper.group_labels(), - helper.key_sort_order(), + helper.num_groups(stream), + helper.group_labels(stream), + helper.key_sort_order(stream), stream, mr)); }; @@ -132,7 +136,7 @@ void aggregrate_result_functor::operator()(aggregation const& auto result = [&]() { if (cudf::is_fixed_width(values.type())) { return detail::group_min( - get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr); + get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr); } else { auto argmin_agg = make_argmin_aggregation(); operator()(*argmin_agg); @@ -169,7 +173,7 @@ void aggregrate_result_functor::operator()(aggregation const& auto result = [&]() { if (cudf::is_fixed_width(values.type())) { return 
detail::group_max( - get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr); + get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr); } else { auto argmax_agg = make_argmax_aggregation(); operator()(*argmax_agg); @@ -238,7 +242,7 @@ void aggregrate_result_functor::operator()(aggregation co auto result = detail::group_var(get_grouped_values(), mean_result, group_sizes, - helper.group_labels(), + helper.group_labels(stream), var_agg._ddof, stream, mr); @@ -271,8 +275,8 @@ void aggregrate_result_functor::operator()(aggregation co auto result = detail::group_quantiles(get_sorted_values(), group_sizes, - helper.group_offsets(), - helper.num_groups(), + helper.group_offsets(stream), + helper.num_groups(stream), quantile_agg._quantiles, quantile_agg._interpolation, stream, @@ -291,8 +295,8 @@ void aggregrate_result_functor::operator()(aggregation cons auto result = detail::group_quantiles(get_sorted_values(), group_sizes, - helper.group_offsets(), - helper.num_groups(), + helper.group_offsets(stream), + helper.num_groups(stream), {0.5}, interpolation::LINEAR, stream, @@ -308,9 +312,9 @@ void aggregrate_result_functor::operator()(aggregation con auto nunique_agg = static_cast(agg); auto result = detail::group_nunique(get_sorted_values(), - helper.group_labels(), - helper.num_groups(), - helper.group_offsets(), + helper.group_labels(stream), + helper.num_groups(stream), + helper.group_offsets(stream), nunique_agg._null_handling, stream, mr); @@ -337,9 +341,9 @@ void aggregrate_result_functor::operator()(aggregation agg, detail::group_nth_element(get_grouped_values(), group_sizes, - helper.group_labels(), - helper.group_offsets(), - helper.num_groups(), + helper.group_labels(stream), + helper.group_offsets(stream), + helper.num_groups(stream), nth_element_agg._n, nth_element_agg._null_handling, stream, @@ -357,7 +361,7 @@ void aggregrate_result_functor::operator()(aggregatio if (cache.has_result(col_idx, agg)) return; auto result = detail::group_collect( - get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); + get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -373,7 +377,7 @@ void aggregrate_result_functor::operator()(aggregation if (cache.has_result(col_idx, agg)) { return; } auto const collect_result = detail::group_collect( - get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); + get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr); auto const nulls_equal = static_cast(agg)._null_equal; cache.add_result(col_idx, @@ -385,7 +389,7 @@ void aggregrate_result_functor::operator()(aggregation // Sort-based groupby std::pair, std::vector> groupby::sort_aggregate( - std::vector const& requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index 565320fbe80..afb92f8e141 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -64,7 +64,7 @@ struct store_result_functor { // It's overridden in scan implementation. return sorted_values->view(); else - return (grouped_values = helper.grouped_values(values))->view(); + return (grouped_values = helper.grouped_values(values, stream))->view(); }; /** @@ -76,7 +76,7 @@ struct store_result_functor { column_view get_sorted_values() { return sorted_values ? 
sorted_values->view() - : (sorted_values = helper.sorted_values(values))->view(); + : (sorted_values = helper.sorted_values(values, stream))->view(); }; protected: diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index 5c8e8b790d4..e6c10aa1056 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,7 +69,7 @@ std::unique_ptr group_nth_element(column_view const &values, auto bitmask_iterator = thrust::make_transform_iterator(cudf::detail::make_validity_iterator(*values_view), [] __device__(auto b) { return static_cast(b); }); - rmm::device_vector intra_group_index(values.size()); + rmm::device_uvector intra_group_index(values.size(), stream); // intra group index for valids only. thrust::exclusive_scan_by_key(rmm::exec_policy(stream), group_labels.begin(), @@ -77,9 +77,9 @@ std::unique_ptr group_nth_element(column_view const &values, bitmask_iterator, intra_group_index.begin()); // group_size to recalculate n if n<0 - rmm::device_vector group_count = [&] { + rmm::device_uvector group_count = [&] { if (n < 0) { - rmm::device_vector group_count(num_groups); + rmm::device_uvector group_count(num_groups, stream); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), @@ -88,7 +88,7 @@ std::unique_ptr group_nth_element(column_view const &values, group_count.begin()); return group_count; } else { - return rmm::device_vector(); + return rmm::device_uvector(0, stream); } }(); // gather the valid index == n diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index fcadb2e71fb..c9f9e3cad9e 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -80,7 +80,7 @@ struct quantiles_functor { column_view const& group_sizes, cudf::device_span group_offsets, size_type const num_groups, - rmm::device_vector const& quantile, + device_span quantile, interpolation interpolation, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -112,7 +112,7 @@ struct quantiles_functor { *group_size_view, *result_view, group_offsets.data(), - quantile.data().get(), + quantile.data(), static_cast(quantile.size()), interpolation}); } else { @@ -125,7 +125,7 @@ struct quantiles_functor { *group_size_view, *result_view, group_offsets.data(), - quantile.data().get(), + quantile.data(), static_cast(quantile.size()), interpolation}); } diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 63de4ea8684..336a6777ffa 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -59,7 +59,7 @@ struct scan_result_functor final : store_result_functor { if (grouped_values) return grouped_values->view(); else - return (grouped_values = helper.grouped_values(values))->view(); + return (grouped_values = helper.grouped_values(values, stream))->view(); }; }; @@ -71,7 +71,8 @@ void scan_result_functor::operator()(aggregation const& agg) cache.add_result( col_idx, agg, - detail::sum_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); + detail::sum_scan( + get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); } template <> @@ -82,7 +83,8 @@ void scan_result_functor::operator()(aggregation const& agg) cache.add_result( col_idx, agg, - detail::min_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); + detail::min_scan( + get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); } template <> @@ -93,7 +95,8 @@ void scan_result_functor::operator()(aggregation const& agg) cache.add_result( col_idx, agg, - detail::max_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); + detail::max_scan( + get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); } template <> @@ -101,13 +104,13 @@ void scan_result_functor::operator()(aggregation const& { if (cache.has_result(col_idx, agg)) return; - cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(), stream, mr)); + cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(stream), stream, mr)); } } // namespace detail // Sort-based groupby std::pair, std::vector> groupby::sort_scan( - std::vector const& requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 6a9da36e21b..5e944f75712 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -141,7 +141,7 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream) // presence of a null value within a row. This allows moving all rows that // contain a null value to the end of the sorted order. 
- auto augmented_keys = table_view({table_view({keys_bitmask_column()}), _keys}); + auto augmented_keys = table_view({table_view({keys_bitmask_column(stream)}), _keys}); _key_sorted_order = cudf::detail::stable_sorted_order( augmented_keys, @@ -164,7 +164,7 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( _group_offsets = std::make_unique(num_keys(stream) + 1, stream); auto device_input_table = table_device_view::create(_keys, stream); - auto sorted_order = key_sort_order().data(); + auto sorted_order = key_sort_order(stream).data(); decltype(_group_offsets->begin()) result_end; if (has_nulls(_keys)) { @@ -207,9 +207,9 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( group_labels.end(), index_vector::value_type{0}); thrust::scatter(rmm::exec_policy(stream), - thrust::make_constant_iterator(1, decltype(num_groups())(1)), - thrust::make_constant_iterator(1, num_groups()), - group_offsets().begin() + 1, + thrust::make_constant_iterator(1, decltype(num_groups(stream))(1)), + thrust::make_constant_iterator(1, num_groups(stream)), + group_offsets(stream).begin() + 1, group_labels.begin()); thrust::inclusive_scan( @@ -226,9 +226,9 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre data_type(type_to_id()), _keys.num_rows(), mask_state::ALL_NULL, stream); auto group_labels_view = cudf::column_view( - data_type(type_to_id()), group_labels().size(), group_labels().data()); + data_type(type_to_id()), group_labels(stream).size(), group_labels(stream).data()); - auto scatter_map = key_sort_order(); + auto scatter_map = key_sort_order(stream); std::unique_ptr
t_unsorted_keys_labels = cudf::detail::scatter(table_view({group_labels_view}), @@ -267,7 +267,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { column_ptr values_sort_order = - cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(), values}), + cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(stream), values}), {}, std::vector(2, null_order::AFTER), stream, @@ -289,7 +289,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto gather_map = key_sort_order(); + auto gather_map = key_sort_order(stream); auto grouped_values_table = cudf::detail::gather(table_view({values}), gather_map, @@ -304,14 +304,14 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto idx_data = key_sort_order().data(); + auto idx_data = key_sort_order(stream).data(); auto gather_map_it = thrust::make_transform_iterator( - group_offsets().begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); + group_offsets(stream).begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); return cudf::detail::gather(_keys, gather_map_it, - gather_map_it + num_groups(), + gather_map_it + num_groups(stream), out_of_bounds_policy::DONT_CHECK, stream, mr); @@ -321,7 +321,7 @@ std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view st rmm::mr::device_memory_resource* mr) { return cudf::detail::gather(_keys, - key_sort_order(), + key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, stream, diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 729b98d85a8..612e2111b66 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -150,8 +150,7 @@ struct dispatch_to_cudf_column { std::unique_ptr get_empty_type_column(size_type size) { - return std::make_unique( - data_type(type_id::EMPTY), size, std::move(rmm::device_buffer(0))); + return std::make_unique(data_type(type_id::EMPTY), size, rmm::device_buffer(0)); } /** diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 99157a23fcb..e69a61bde66 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -17,6 +17,7 @@ #include "orc_common.h" #include "orc_gpu.h" +#include #include #include @@ -46,14 +47,16 @@ struct dictinit_state_s { }; /** - * @brief Return a 12-bit hash from a byte sequence + * @brief Return a 12-bit hash from a string */ -static inline __device__ uint32_t nvstr_init_hash(char const *ptr, uint32_t len) +static inline __device__ uint32_t hash_string(const string_view val) { - if (len != 0) { - return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); - } else { + if (val.empty()) { return 0; + } else { + char const *ptr = val.data(); + uint32_t len = val.size_bytes(); + return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); } } @@ -71,7 +74,8 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, { if (t == 0) { s->nnz = 0; } for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) { - const uint32_t *valid_map = s->chunk.valid_map_base; + const uint32_t *valid_map = s->chunk.leaf_column->null_mask(); + auto column_offset = s->chunk.leaf_column->offset(); uint32_t is_valid, nz_pos; if (t < block_size / 32) { if (!valid_map) { @@ -80,10 +84,10 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, uint32_t const row = s->chunk.start_row + i + t * 32; auto const chunk_end = s->chunk.start_row + s->chunk.num_rows; - auto const valid_map_idx = (row + s->chunk.column_offset) / 32; + auto const valid_map_idx = (row + column_offset) / 32; uint32_t valid = (row < chunk_end) ? 
valid_map[valid_map_idx] : 0; - auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f; + auto const rows_in_next_word = (row + column_offset) & 0x1f; if (rows_in_next_word != 0) { auto const rows_in_current_word = 32 - rows_in_next_word; // Read next word if any rows are within the chunk @@ -111,12 +115,18 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, * @brief Gather all non-NULL string rows and compute total character data size * * @param[in] chunks DictionaryChunk device array [rowgroup][column] - * @param[in] num_columns Number of columns + * @param[in] num_columns Number of string columns */ // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size, 2) - gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns) + gpuInitDictionaryIndices(DictionaryChunk *chunks, + const table_device_view view, + uint32_t *dict_data, + uint32_t *dict_index, + size_t row_index_stride, + size_type *str_col_ids, + uint32_t num_columns) { __shared__ __align__(16) dictinit_state_s state_g; @@ -131,12 +141,21 @@ __global__ void __launch_bounds__(block_size, 2) dictinit_state_s *const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t group_id = blockIdx.y; - const nvstrdesc_s *ck_data; - uint32_t *dict_data; uint32_t nnz, start_row, dict_char_count; int t = threadIdx.x; - if (t == 0) s->chunk = chunks[group_id * num_columns + col_id]; + if (t == 0) { + column_device_view *leaf_column_view = view.begin() + str_col_ids[col_id]; + s->chunk = chunks[group_id * num_columns + col_id]; + s->chunk.leaf_column = leaf_column_view; + s->chunk.dict_data = + dict_data + col_id * leaf_column_view->size() + group_id * row_index_stride; + s->chunk.dict_index = dict_index + col_id * leaf_column_view->size(); + s->chunk.start_row = group_id * row_index_stride; + s->chunk.num_rows = + min(row_index_stride, + max(static_cast(leaf_column_view->size() - s->chunk.start_row), size_t{0})); + } for (uint32_t i = 0; i < sizeof(s->map) / sizeof(uint32_t); i += block_size) { if (i + t < sizeof(s->map) / sizeof(uint32_t)) s->map.u32[i + t] = 0; } @@ -152,15 +171,15 @@ __global__ void __launch_bounds__(block_size, 2) nnz = s->nnz; dict_data = s->chunk.dict_data; start_row = s->chunk.start_row; - ck_data = static_cast(s->chunk.column_data_base) + start_row; for (uint32_t i = 0; i < nnz; i += block_size) { uint32_t ck_row = 0; uint32_t hash = 0; uint32_t len = 0; if (i + t < nnz) { - ck_row = s->dict[i + t]; - len = static_cast(ck_data[ck_row].count); - hash = nvstr_init_hash(ck_data[ck_row].ptr, len); + ck_row = s->dict[i + t]; + string_view string_val = s->chunk.leaf_column->element(ck_row + start_row); + len = static_cast(string_val.size_bytes()); + hash = hash_string(string_val); } len = block_reduce(temp_storage.reduce_storage).Sum(len); if (t == 0) s->chunk.string_char_count += len; @@ -200,10 +219,11 @@ __global__ void __launch_bounds__(block_size, 2) uint32_t ck_row = 0, pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row; bool collision; if (i + t < nnz) { - ck_row = dict_data[i + t] - start_row; - hash = nvstr_init_hash(ck_data[ck_row].ptr, static_cast(ck_data[ck_row].count)); - sh = (hash & 1) ? 16 : 0; - pos_old = s->map.u16[hash]; + ck_row = dict_data[i + t] - start_row; + string_view string_val = s->chunk.leaf_column->element(ck_row + start_row); + hash = hash_string(string_val); + sh = (hash & 1) ? 
16 : 0; + pos_old = s->map.u16[hash]; } // The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic // behavior for the first row in the hash map that will be used for early duplicate detection @@ -233,18 +253,16 @@ __global__ void __launch_bounds__(block_size, 2) for (uint32_t i = 0; i < nnz; i += block_size) { uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0; if (i + t < nnz) { - const char *str1, *str2; - uint32_t len1, len2, hash; - ck_row = s->dict[i + t]; - str1 = ck_data[ck_row].ptr; - len1 = static_cast(ck_data[ck_row].count); - hash = nvstr_init_hash(str1, len1); - ck_row_ref = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0]; + ck_row = s->dict[i + t]; + string_view string_value = s->chunk.leaf_column->element(ck_row + start_row); + auto const string_length = static_cast(string_value.size_bytes()); + auto const hash = hash_string(string_value); + ck_row_ref = s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0]; if (ck_row_ref != ck_row) { - str2 = ck_data[ck_row_ref].ptr; - len2 = static_cast(ck_data[ck_row_ref].count); - is_dupe = nvstr_is_equal(str1, len1, str2, len2); - dict_char_count += (is_dupe) ? 0 : len1; + string_view reference_string = + s->chunk.leaf_column->element(ck_row_ref + start_row); + is_dupe = (string_value == reference_string); + dict_char_count += (is_dupe) ? 0 : string_length; } } uint32_t dupes_in_block; @@ -269,6 +287,12 @@ __global__ void __launch_bounds__(block_size, 2) chunks[group_id * num_columns + col_id].string_char_count = s->chunk.string_char_count; chunks[group_id * num_columns + col_id].num_dict_strings = nnz - s->total_dupes; chunks[group_id * num_columns + col_id].dict_char_count = dict_char_count; + chunks[group_id * num_columns + col_id].leaf_column = s->chunk.leaf_column; + + chunks[group_id * num_columns + col_id].dict_data = s->chunk.dict_data; + chunks[group_id * num_columns + col_id].dict_index = s->chunk.dict_index; + chunks[group_id * num_columns + col_id].start_row = s->chunk.start_row; + chunks[group_id * num_columns + col_id].num_rows = s->chunk.num_rows; } } @@ -357,7 +381,6 @@ __global__ void __launch_bounds__(block_size) uint32_t num_strings; uint32_t *dict_data, *dict_index; uint32_t dict_char_count; - const nvstrdesc_s *str_data; int t = threadIdx.x; if (t == 0) s->stripe = stripes[stripe_id * num_columns + col_id]; @@ -366,21 +389,20 @@ __global__ void __launch_bounds__(block_size) num_strings = s->stripe.num_strings; dict_data = s->stripe.dict_data; if (!dict_data) return; - dict_index = s->stripe.dict_index; - str_data = static_cast(s->stripe.column_data_base); - dict_char_count = 0; + dict_index = s->stripe.dict_index; + string_view current_string = string_view::min(); + dict_char_count = 0; for (uint32_t i = 0; i < num_strings; i += block_size) { uint32_t cur = (i + t < num_strings) ? dict_data[i + t] : 0; uint32_t cur_len = 0; - const char *cur_ptr; - bool is_dupe = false; + bool is_dupe = false; if (i + t < num_strings) { - cur_ptr = str_data[cur].ptr; - cur_len = str_data[cur].count; + current_string = s->stripe.leaf_column->element(cur); + cur_len = current_string.size_bytes(); } if (i + t != 0 && i + t < num_strings) { uint32_t prev = dict_data[i + t - 1]; - is_dupe = nvstr_is_equal(cur_ptr, cur_len, str_data[prev].ptr, str_data[prev].count); + is_dupe = (current_string == (s->stripe.leaf_column->element(prev))); } dict_char_count += (is_dupe) ? 
0 : cur_len; uint32_t dupes_in_block; @@ -403,14 +425,14 @@ __global__ void __launch_bounds__(block_size) } /** - * @brief Launches kernel for initializing dictionary chunks - * - * @param[in] chunks DictionaryChunk device array [rowgroup][column] - * @param[in] num_columns Number of columns - * @param[in] num_rowgroups Number of row groups - * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` + * @copydoc cudf::io::orc::gpu::InitDictionaryIndices */ -void InitDictionaryIndices(DictionaryChunk *chunks, +void InitDictionaryIndices(const table_device_view &view, + DictionaryChunk *chunks, + uint32_t *dict_data, + uint32_t *dict_index, + size_t row_index_stride, + size_type *str_col_ids, uint32_t num_columns, uint32_t num_rowgroups, rmm::cuda_stream_view stream) @@ -418,20 +440,12 @@ void InitDictionaryIndices(DictionaryChunk *chunks, static constexpr int block_size = 512; dim3 dim_block(block_size, 1); dim3 dim_grid(num_columns, num_rowgroups); - gpuInitDictionaryIndices - <<>>(chunks, num_columns); + gpuInitDictionaryIndices<<>>( + chunks, view, dict_data, dict_index, row_index_stride, str_col_ids, num_columns); } /** - * @brief Launches kernel for building stripe dictionaries - * - * @param[in] stripes StripeDictionary device array [stripe][column] - * @param[in] stripes_host StripeDictionary host array [stripe][column] - * @param[in] chunks DictionaryChunk device array [rowgroup][column] - * @param[in] num_stripes Number of stripes - * @param[in] num_rowgroups Number of row groups - * @param[in] num_columns Number of columns - * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` + * @copydoc cudf::io::orc::gpu::BuildStripeDictionaries */ void BuildStripeDictionaries(StripeDictionary *stripes, StripeDictionary *stripes_host, @@ -447,18 +461,16 @@ void BuildStripeDictionaries(StripeDictionary *stripes, stripes, chunks, num_columns); for (uint32_t i = 0; i < num_stripes * num_columns; i++) { if (stripes_host[i].dict_data != nullptr) { - thrust::device_ptr p = thrust::device_pointer_cast(stripes_host[i].dict_data); - const nvstrdesc_s *str_data = - static_cast(stripes_host[i].column_data_base); + thrust::device_ptr dict_data_ptr = + thrust::device_pointer_cast(stripes_host[i].dict_data); + column_device_view *string_column = stripes_host[i].leaf_column; // NOTE: Requires the --expt-extended-lambda nvcc flag thrust::sort(rmm::exec_policy(stream), - p, - p + stripes_host[i].num_strings, - [str_data] __device__(const uint32_t &lhs, const uint32_t &rhs) { - return nvstr_is_lesser(str_data[lhs].ptr, - (uint32_t)str_data[lhs].count, - str_data[rhs].ptr, - (uint32_t)str_data[rhs].count); + dict_data_ptr, + dict_data_ptr + stripes_host[i].num_strings, + [string_column] __device__(const uint32_t &lhs, const uint32_t &rhs) { + return string_column->element(lhs) < + string_column->element(rhs); }); } } diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 7ad92e40cb4..55df0adf95b 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -124,16 +125,15 @@ struct RowGroup { * @brief Struct to describe an encoder data chunk */ struct EncChunk { - const uint32_t *valid_map_base; // base ptr of input valid bit map - size_type column_offset; // index of the first element relative to the base memory - const void *column_data_base; // base ptr of input column data - uint32_t start_row; // start row of this chunk - uint32_t num_rows; // number of rows in this chunk 
- uint32_t valid_rows; // max number of valid rows - uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind) - uint8_t type_kind; // column data type (orc::TypeKind) - uint8_t dtype_len; // data type length - uint8_t scale; // scale for decimals or timestamps + uint32_t start_row; // start row of this chunk + uint32_t num_rows; // number of rows in this chunk + uint8_t encoding_kind; // column encoding kind (orc::ColumnEncodingKind) + uint8_t type_kind; // column data type (orc::TypeKind) + uint8_t dtype_len; // data type length + uint8_t scale; // scale for decimals or timestamps + + uint32_t *dict_index; // dictionary index from row index + column_device_view *leaf_column; }; /** @@ -163,10 +163,7 @@ struct StripeStream { * @brief Struct to describe a dictionary chunk */ struct DictionaryChunk { - const uint32_t *valid_map_base; // base ptr of input valid bit map - size_type column_offset; // index of the first element relative to the base memory - const void *column_data_base; // base ptr of column data (ptr,len pair) - uint32_t *dict_data; // dictionary data (index of non-null rows) + uint32_t *dict_data; // dictionary data (index of non-null rows) uint32_t *dict_index; // row indices of corresponding string (row from dictionary index) uint32_t start_row; // start row of this chunk uint32_t num_rows; // num rows in this chunk @@ -175,20 +172,23 @@ struct DictionaryChunk { string_char_count; // total size of string data (NOTE: assumes less than 4G bytes per chunk) uint32_t num_dict_strings; // number of strings in dictionary uint32_t dict_char_count; // size of dictionary string data for this chunk + + column_device_view *leaf_column; //!< Pointer to string column }; /** * @brief Struct to describe a dictionary */ struct StripeDictionary { - const void *column_data_base; // base ptr of column data (ptr,len pair) - uint32_t *dict_data; // row indices of corresponding string (row from dictionary index) - uint32_t *dict_index; // dictionary index from row index - uint32_t column_id; // real column id - uint32_t start_chunk; // first chunk in stripe - uint32_t num_chunks; // number of chunks in the stripe - uint32_t num_strings; // number of unique strings in the dictionary - uint32_t dict_char_count; // total size of dictionary string data + uint32_t *dict_data; // row indices of corresponding string (row from dictionary index) + uint32_t *dict_index; // dictionary index from row index + uint32_t column_id; // real column id + uint32_t start_chunk; // first chunk in stripe + uint32_t num_chunks; // number of chunks in the stripe + uint32_t num_strings; // number of unique strings in the dictionary + uint32_t dict_char_count; // total size of dictionary string data + + column_device_view *leaf_column; //!< Pointer to string column }; /** @@ -313,6 +313,17 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, detail::device_2dspan enc_streams, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @brief Set leaf column element of EncChunk + * + * @param[in] view table device view representing input table + * @param[in,out] chunks encoder chunk device array [column][rowgroup] + * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` + */ +void set_chunk_columns(const table_device_view &view, + detail::device_2dspan chunks, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for compacting chunked column data prior to compression * @@ -350,15 +361,25 @@ void CompressOrcDataStreams(uint8_t *compressed_data, /** * @brief Launches 
kernel for initializing dictionary chunks * + * @param[in] view table device view representing input table * @param[in,out] chunks DictionaryChunk device array [rowgroup][column] + * @param[in] dict_data dictionary data (index of non-null rows) + * @param[in] dict_index row indices of corresponding string (row from dictionary index) + * @param[in] row_index_stride Rowgroup size in rows + * @param[in] str_col_ids List of columns that are strings type * @param[in] num_columns Number of columns * @param[in] num_rowgroups Number of row groups * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void InitDictionaryIndices(DictionaryChunk *chunks, +void InitDictionaryIndices(const table_device_view &view, + DictionaryChunk *chunks, + uint32_t *dict_data, + uint32_t *dict_index, + size_t row_index_stride, + size_type *str_col_ids, uint32_t num_columns, uint32_t num_rowgroups, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building stripe dictionaries diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index aef32efaf6e..10932d36309 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -669,19 +669,20 @@ __global__ void __launch_bounds__(block_size) if (t * 8 < nrows) { uint32_t row = s->chunk.start_row + present_rows + t * 8; uint8_t valid = 0; - if (row < s->chunk.valid_rows) { - if (s->chunk.valid_map_base) { - size_type current_valid_offset = row + s->chunk.column_offset; - size_type next_valid_offset = current_valid_offset + min(32, s->chunk.valid_rows); + if (row < s->chunk.leaf_column->size()) { + if (s->chunk.leaf_column->nullable()) { + size_type current_valid_offset = row + s->chunk.leaf_column->offset(); + size_type next_valid_offset = + current_valid_offset + min(32, s->chunk.leaf_column->size()); bitmask_type mask = cudf::detail::get_mask_offset_word( - s->chunk.valid_map_base, 0, current_valid_offset, next_valid_offset); + s->chunk.leaf_column->null_mask(), 0, current_valid_offset, next_valid_offset); valid = 0xff & mask; } else { valid = 0xff; } - if (row + 7 > s->chunk.valid_rows) { - valid = valid & ((1 << (s->chunk.valid_rows & 7)) - 1); + if (row + 7 > s->chunk.leaf_column->size()) { + valid = valid & ((1 << (s->chunk.leaf_column->size() & 7)) - 1); } } s->valid_buf[(row >> 3) & 0x1ff] = valid; @@ -729,19 +730,18 @@ __global__ void __launch_bounds__(block_size) lengths_to_positions(s->buf.u32, 512, t); __syncthreads(); if (valid) { - int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1); - void const *base = s->chunk.column_data_base; + int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1); switch (s->chunk.type_kind) { case INT: case DATE: - case FLOAT: s->vals.u32[nz_idx] = static_cast(base)[row]; break; + case FLOAT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; case DOUBLE: - case LONG: s->vals.u64[nz_idx] = static_cast(base)[row]; break; - case SHORT: s->vals.u32[nz_idx] = static_cast(base)[row]; break; + case LONG: s->vals.u64[nz_idx] = s->chunk.leaf_column->element(row); break; + case SHORT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; case BOOLEAN: - case BYTE: s->vals.u8[nz_idx] = static_cast(base)[row]; break; + case BYTE: s->vals.u8[nz_idx] = s->chunk.leaf_column->element(row); break; case TIMESTAMP: { - int64_t ts = static_cast(base)[row]; + int64_t ts = s->chunk.leaf_column->element(row); int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; int64_t seconds = ts / 
ts_scale; int64_t nanos = (ts - seconds * ts_scale); @@ -772,16 +772,13 @@ __global__ void __launch_bounds__(block_size) } case STRING: if (s->chunk.encoding_kind == DICTIONARY_V2) { - uint32_t dict_idx = static_cast(base)[row]; - if (dict_idx > 0x7fffffffu) - dict_idx = static_cast(base)[dict_idx & 0x7fffffffu]; + uint32_t dict_idx = s->chunk.dict_index[row]; + if (dict_idx > 0x7fffffffu) dict_idx = s->chunk.dict_index[dict_idx & 0x7fffffffu]; s->vals.u32[nz_idx] = dict_idx; } else { - const nvstrdesc_s *str_desc = static_cast(base) + row; - const char *ptr = str_desc->ptr; - uint32_t count = static_cast(str_desc->count); - s->u.strenc.str_data[s->buf.u32[t] - 1] = ptr; - s->lengths.u32[nz_idx] = count; + string_view value = s->chunk.leaf_column->element(row); + s->u.strenc.str_data[s->buf.u32[t] - 1] = value.data(); + s->lengths.u32[nz_idx] = value.size_bytes(); } break; default: break; @@ -899,8 +896,8 @@ __global__ void __launch_bounds__(block_size) streams[col_id][group_id].lengths[t] = s->strm_pos[t]; if (!s->stream.data_ptrs[t]) { streams[col_id][group_id].data_ptrs[t] = - static_cast(const_cast(s->chunk.column_data_base)) + - s->chunk.start_row * s->chunk.dtype_len; + static_cast(const_cast(s->chunk.leaf_column->head())) + + (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len; } } } @@ -939,8 +936,8 @@ __global__ void __launch_bounds__(block_size) s->nrows = s->u.dict_stripe.num_strings; s->cur_row = 0; } - auto const str_desc = static_cast(s->u.dict_stripe.column_data_base); - auto const dict_data = s->u.dict_stripe.dict_data; + column_device_view *string_column = s->u.dict_stripe.leaf_column; + auto const dict_data = s->u.dict_stripe.dict_data; __syncthreads(); if (s->chunk.encoding_kind != DICTIONARY_V2) { return; // This column isn't using dictionary encoding -> bail out @@ -951,8 +948,13 @@ __global__ void __launch_bounds__(block_size) uint32_t string_idx = (t < numvals) ? dict_data[s->cur_row + t] : 0; if (cid == CI_DICTIONARY) { // Encoding string contents - const char *ptr = (t < numvals) ? str_desc[string_idx].ptr : 0; - uint32_t count = (t < numvals) ? static_cast(str_desc[string_idx].count) : 0; + const char *ptr = 0; + uint32_t count = 0; + if (t < numvals) { + auto string_val = string_column->element(string_idx); + ptr = string_val.data(); + count = string_val.size_bytes(); + } s->u.strenc.str_data[t] = ptr; StoreStringData(s->stream.data_ptrs[CI_DICTIONARY] + s->strm_pos[CI_DICTIONARY], &s->u.strenc, @@ -961,7 +963,10 @@ __global__ void __launch_bounds__(block_size) if (!t) { s->strm_pos[CI_DICTIONARY] += s->u.strenc.char_count; } } else { // Encoding string lengths - uint32_t count = (t < numvals) ? static_cast(str_desc[string_idx].count) : 0; + uint32_t count = + (t < numvals) + ? 
static_cast(string_column->element(string_idx).size_bytes()) + : 0; uint32_t nz_idx = (s->cur_row + t) & 0x3ff; if (t < numvals) s->lengths.u32[nz_idx] = count; __syncthreads(); @@ -982,6 +987,15 @@ __global__ void __launch_bounds__(block_size) if (t == 0) { strm_ptr->lengths[cid] = s->strm_pos[cid]; } } +__global__ void __launch_bounds__(512) + gpu_set_chunk_columns(const table_device_view view, device_2dspan chunks) +{ + // Set leaf_column member of EncChunk + for (size_type i = threadIdx.x; i < chunks.size().second; i += blockDim.x) { + chunks[blockIdx.x][i].leaf_column = view.begin() + blockIdx.x; + } +} + /** * @brief Merge chunked column data into a single contiguous stream * @@ -1189,6 +1203,16 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, <<>>(stripes, chunks, enc_streams); } +void set_chunk_columns(const table_device_view &view, + device_2dspan chunks, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(512, 1); + dim3 dim_grid(chunks.size().first, 1); + + gpu_set_chunk_columns<<>>(view, chunks); +} + void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index eb5e90bbeec..10050806552 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -42,7 +42,6 @@ namespace detail { namespace orc { using namespace cudf::io::orc; using namespace cudf::io; -using cudf::io::orc::gpu::nvstrdesc_s; struct row_group_index_info { int32_t pos = -1; // Position @@ -111,39 +110,6 @@ constexpr T to_clockscale(cudf::type_id timestamp_id) } // namespace -/** - * @brief Helper kernel for converting string data/offsets into nvstrdesc - * REMOVEME: Once we eliminate the legacy readers/writers, the kernels could be - * made to use the native offset+data layout. - */ -__global__ void stringdata_to_nvstrdesc(gpu::nvstrdesc_s *dst, - const size_type *offsets, - const char *strdata, - const uint32_t *nulls, - const size_type column_offset, - size_type column_size) -{ - size_type row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < column_size) { - uint32_t is_valid = (nulls != nullptr) - ? (nulls[(row + column_offset) / 32] >> ((row + column_offset) % 32)) & 1 - : 1; - size_t count; - const char *ptr; - if (is_valid) { - size_type cur = offsets[row]; - size_type next = offsets[row + 1]; - ptr = strdata + cur; - count = (next > cur) ? next - cur : 0; - } else { - ptr = nullptr; - count = 0; - } - dst[row].ptr = ptr; - dst[row].count = count; - } -} - /** * @brief Helper class that adds ORC-specific column info */ @@ -160,31 +126,14 @@ class orc_column_view { rmm::cuda_stream_view stream) : _id(id), _str_id(str_id), - _string_type(col.type().id() == type_id::STRING), - _type_width(_string_type ? 0 : cudf::size_of(col.type())), + _is_string_type(col.type().id() == type_id::STRING), + _type_width(_is_string_type ? 
0 : cudf::size_of(col.type())), _data_count(col.size()), _null_count(col.null_count()), - _data(col.head() + col.offset() * _type_width), _nulls(col.null_mask()), - _column_offset(col.offset()), _clockscale(to_clockscale(col.type().id())), _type_kind(to_orc_type(col.type().id())) { - if (_string_type && _data_count > 0) { - strings_column_view view{col}; - _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream); - - stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>( - static_cast(_indexes.data()), - view.offsets().data() + view.offset(), - view.chars().data(), - _nulls, - _column_offset, - _data_count); - _data = _indexes.data(); - - stream.synchronize(); - } // Generating default name if name isn't present in metadata if (metadata && _id < metadata->column_names.size()) { _name = metadata->column_names[_id]; @@ -193,7 +142,7 @@ class orc_column_view { } } - auto is_string() const noexcept { return _string_type; } + auto is_string() const noexcept { return _is_string_type; } void set_dict_stride(size_t stride) noexcept { dict_stride = stride; } auto get_dict_stride() const noexcept { return dict_stride; } @@ -207,7 +156,7 @@ class orc_column_view { } auto host_dict_chunk(size_t rowgroup) const { - assert(_string_type); + assert(_is_string_type); return &dict[rowgroup * dict_stride + _str_id]; } auto device_dict_chunk() const { return d_dict; } @@ -223,7 +172,7 @@ class orc_column_view { } auto host_stripe_dict(size_t stripe) const { - assert(_string_type); + assert(_is_string_type); return &stripe_dict[stripe * dict_stride + _str_id]; } auto device_stripe_dict() const { return d_stripe_dict; } @@ -233,9 +182,7 @@ class orc_column_view { size_t data_count() const noexcept { return _data_count; } size_t null_count() const noexcept { return _null_count; } bool nullable() const noexcept { return (_nulls != nullptr); } - void const *data() const noexcept { return _data; } uint32_t const *nulls() const noexcept { return _nulls; } - size_type column_offset() const noexcept { return _column_offset; } uint8_t clockscale() const noexcept { return _clockscale; } void set_orc_encoding(ColumnEncodingKind e) { _encoding_kind = e; } @@ -245,17 +192,15 @@ class orc_column_view { private: // Identifier within set of columns and string columns, respectively - size_t _id = 0; - size_t _str_id = 0; - bool _string_type = false; - - size_t _type_width = 0; - size_t _data_count = 0; - size_t _null_count = 0; - void const *_data = nullptr; - uint32_t const *_nulls = nullptr; - size_type _column_offset = 0; - uint8_t _clockscale = 0; + size_t _id = 0; + size_t _str_id = 0; + bool _is_string_type = false; + + size_t _type_width = 0; + size_t _data_count = 0; + size_t _null_count = 0; + uint32_t const *_nulls = nullptr; + uint8_t _clockscale = 0; // ORC-related members std::string _name{}; @@ -263,7 +208,6 @@ class orc_column_view { ColumnEncodingKind _encoding_kind; // String dictionary-related members - rmm::device_buffer _indexes; size_t dict_stride = 0; gpu::DictionaryChunk const *dict = nullptr; gpu::StripeDictionary const *stripe_dict = nullptr; @@ -308,8 +252,10 @@ std::vector writer::impl::gather_stripe_info( return infos; } -void writer::impl::init_dictionaries(orc_column_view *columns, +void writer::impl::init_dictionaries(const table_device_view &view, + orc_column_view *columns, std::vector const &str_col_ids, + device_span d_str_col_ids, uint32_t *dict_data, uint32_t *dict_index, hostdevice_vector *dict) @@ -321,26 +267,17 @@ void 
writer::impl::init_dictionaries(orc_column_view *columns, auto &str_column = columns[str_col_ids[i]]; str_column.set_dict_stride(str_col_ids.size()); str_column.attach_dict_chunk(dict->host_ptr(), dict->device_ptr()); - - for (size_t g = 0; g < num_rowgroups; g++) { - auto *ck = &(*dict)[g * str_col_ids.size() + i]; - ck->valid_map_base = str_column.nulls(); - ck->column_offset = str_column.column_offset(); - ck->column_data_base = str_column.data(); - ck->dict_data = dict_data + i * str_column.data_count() + g * row_index_stride_; - ck->dict_index = dict_index + i * str_column.data_count(); // Indexed by abs row - ck->start_row = g * row_index_stride_; - ck->num_rows = std::min(row_index_stride_, - std::max(str_column.data_count() - ck->start_row, 0)); - ck->num_strings = 0; - ck->string_char_count = 0; - ck->num_dict_strings = 0; - ck->dict_char_count = 0; - } } - dict->host_to_device(stream); - gpu::InitDictionaryIndices(dict->device_ptr(), str_col_ids.size(), num_rowgroups, stream); + gpu::InitDictionaryIndices(view, + dict->device_ptr(), + dict_data, + dict_index, + row_index_stride_, + d_str_col_ids.data(), + d_str_col_ids.size(), + num_rowgroups, + stream); dict->device_to_host(stream, true); } @@ -358,19 +295,19 @@ void writer::impl::build_dictionaries(orc_column_view *columns, str_column.attach_stripe_dict(stripe_dict.host_ptr(), stripe_dict.device_ptr()); for (auto const &stripe : stripe_bounds) { - auto &sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; - sd.column_data_base = str_column.host_dict_chunk(0)->column_data_base; - sd.dict_data = str_column.host_dict_chunk(stripe.first)->dict_data; - sd.dict_index = dict_index + col_idx * str_column.data_count(); // Indexed by abs row - sd.column_id = str_col_ids[col_idx]; - sd.start_chunk = stripe.first; - sd.num_chunks = stripe.size; - sd.dict_char_count = 0; + auto &sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; + sd.dict_data = str_column.host_dict_chunk(stripe.first)->dict_data; + sd.dict_index = dict_index + col_idx * str_column.data_count(); // Indexed by abs row + sd.column_id = str_col_ids[col_idx]; + sd.start_chunk = stripe.first; + sd.num_chunks = stripe.size; + sd.dict_char_count = 0; sd.num_strings = std::accumulate(stripe.cbegin(), stripe.cend(), 0, [&](auto dt_str_cnt, auto rg_idx) { const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx]; return dt_str_cnt + dt.num_dict_strings; }); + sd.leaf_column = dict[col_idx].leaf_column; } if (enable_dictionary_) { @@ -384,8 +321,8 @@ void writer::impl::build_dictionaries(orc_column_view *columns, string_column_cost{}, [&](auto cost, auto rg_idx) -> string_column_cost { const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx]; - return {cost.dictionary + dt.dict_char_count + dt.num_dict_strings, - cost.direct + dt.string_char_count}; + return {cost.direct + dt.string_char_count, + cost.dictionary + dt.dict_char_count + dt.num_dict_strings}; }); // Disable dictionary if it does not reduce the output size if (col_cost.dictionary >= col_cost.direct) { @@ -593,15 +530,16 @@ struct segmented_valid_cnt_input { std::vector indices; }; -encoded_data writer::impl::encode_columns(host_span columns, +encoded_data writer::impl::encode_columns(const table_device_view &view, + host_span columns, std::vector const &str_col_ids, host_span stripe_bounds, orc_streams const &streams) { auto const num_columns = columns.size(); auto const num_rowgroups = stripes_size(stripe_bounds); - hostdevice_2dvector chunks(num_columns, num_rowgroups); - hostdevice_2dvector 
chunk_streams(num_columns, num_rowgroups); + hostdevice_2dvector chunks(num_columns, num_rowgroups, stream); + hostdevice_2dvector chunk_streams(num_columns, num_rowgroups, stream); auto const stream_offsets = streams.compute_offsets(columns, num_rowgroups); rmm::device_uvector encoded_data(stream_offsets.data_size(), stream); @@ -614,23 +552,17 @@ encoded_data writer::impl::encode_columns(host_span colum auto const rg_idx = *rg_idx_it; auto &ck = chunks[column.id()][rg_idx]; - ck.start_row = (rg_idx * row_index_stride_); - ck.num_rows = std::min(row_index_stride_, column.data_count() - ck.start_row); - ck.valid_rows = column.data_count(); + ck.start_row = (rg_idx * row_index_stride_); + ck.num_rows = std::min(row_index_stride_, column.data_count() - ck.start_row); ck.encoding_kind = column.orc_encoding(); ck.type_kind = column.orc_kind(); if (ck.type_kind == TypeKind::STRING) { - ck.valid_map_base = column.nulls(); - ck.column_offset = column.column_offset(); - ck.column_data_base = (ck.encoding_kind == DICTIONARY_V2) - ? column.host_stripe_dict(stripe.id)->dict_index - : column.data(); + ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) + ? column.host_stripe_dict(stripe.id)->dict_index + : nullptr; ck.dtype_len = 1; } else { - ck.valid_map_base = column.nulls(); - ck.column_offset = column.column_offset(); - ck.column_data_base = column.data(); - ck.dtype_len = column.type_width(); + ck.dtype_len = column.type_width(); } ck.scale = column.clockscale(); // Only need to check row groups that end within the stripe @@ -730,6 +662,8 @@ encoded_data writer::impl::encode_columns(host_span colum chunks.host_to_device(stream); chunk_streams.host_to_device(stream); + gpu::set_chunk_columns(view, chunks, stream); + if (!str_col_ids.empty()) { auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict(); gpu::EncodeStripeDictionaries( @@ -791,8 +725,8 @@ std::vector> writer::impl::gather_statistic_blobs( size_t num_chunks = num_rowgroups * columns.size(); std::vector> stat_blobs(num_stat_blobs); - hostdevice_vector stat_desc(columns.size()); - hostdevice_vector stat_merge(num_stat_blobs); + hostdevice_vector stat_desc(columns.size(), stream); + hostdevice_vector stat_merge(num_stat_blobs, stream); rmm::device_uvector stat_chunks(num_chunks + num_stat_blobs, stream); rmm::device_uvector stat_groups(num_chunks, stream); @@ -811,11 +745,8 @@ std::vector> writer::impl::gather_statistic_blobs( case TypeKind::STRING: desc->stats_dtype = dtype_string; break; default: desc->stats_dtype = dtype_none; break; } - desc->num_rows = column.data_count(); - desc->num_values = column.data_count(); - desc->valid_map_base = column.nulls(); - desc->column_offset = column.column_offset(); - desc->column_data_base = column.data(); + desc->num_rows = column.data_count(); + desc->num_values = column.data_count(); if (desc->stats_dtype == dtype_timestamp64) { // Timestamp statistics are in milliseconds switch (column.clockscale()) { @@ -869,8 +800,8 @@ std::vector> writer::impl::gather_statistic_blobs( stat_merge.device_ptr(), stat_chunks.data() + num_chunks, num_stat_blobs, stream); stat_merge.device_to_host(stream, true); - hostdevice_vector blobs(stat_merge[num_stat_blobs - 1].start_chunk + - stat_merge[num_stat_blobs - 1].num_chunks); + hostdevice_vector blobs( + stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream); gpu::orc_encode_statistics(blobs.device_ptr(), stat_merge.device_ptr(), stat_chunks.data() + num_chunks, @@ -1061,6 +992,22 @@ void 
writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } +rmm::device_uvector get_string_column_ids(const table_device_view &view, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector string_column_ids(view.num_columns(), stream); + auto iter = thrust::make_counting_iterator(0); + auto end_iter = thrust::copy_if(rmm::exec_policy(stream), + iter, + iter + view.num_columns(), + string_column_ids.begin(), + [view] __device__(size_type index) { + return (view.column(index).type().id() == type_id::STRING); + }); + string_column_ids.resize(end_iter - string_column_ids.begin(), stream); + return string_column_ids; +} + void writer::impl::write(table_view const &table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); @@ -1074,6 +1021,9 @@ void writer::impl::write(table_view const &table) "be specified"); } + auto device_columns = table_device_view::create(table, stream); + auto string_column_ids = get_string_column_ids(*device_columns, stream); + // Wrapper around cudf columns to attach ORC-specific type info std::vector orc_columns; orc_columns.reserve(num_columns); @@ -1093,9 +1043,15 @@ void writer::impl::write(table_view const &table) // Build per-column dictionary indices const auto num_rowgroups = div_by_rowgroups(num_rows); const auto num_dict_chunks = num_rowgroups * str_col_ids.size(); - hostdevice_vector dict(num_dict_chunks); + hostdevice_vector dict(num_dict_chunks, stream); if (!str_col_ids.empty()) { - init_dictionaries(orc_columns.data(), str_col_ids, dict_data.data(), dict_index.data(), &dict); + init_dictionaries(*device_columns, + orc_columns.data(), + str_col_ids, + string_column_ids, + dict_data.data(), + dict_index.data(), + &dict); } // Decide stripe boundaries early on, based on uncompressed size @@ -1103,23 +1059,22 @@ void writer::impl::write(table_view const &table) // Build stripe-level dictionaries const auto num_stripe_dict = stripe_bounds.size() * str_col_ids.size(); - hostdevice_vector stripe_dict(num_stripe_dict); + hostdevice_vector stripe_dict(num_stripe_dict, stream); if (!str_col_ids.empty()) { build_dictionaries( orc_columns.data(), str_col_ids, stripe_bounds, dict, dict_index.data(), stripe_dict); } auto streams = create_streams(orc_columns, stripe_bounds); - auto enc_data = encode_columns(orc_columns, str_col_ids, stripe_bounds, streams); + auto enc_data = encode_columns(*device_columns, orc_columns, str_col_ids, stripe_bounds, streams); // Assemble individual disparate column chunks into contiguous data streams const auto num_index_streams = (num_columns + 1); const auto num_data_streams = streams.size() - num_index_streams; - hostdevice_2dvector strm_descs(stripe_bounds.size(), num_data_streams); + hostdevice_2dvector strm_descs(stripe_bounds.size(), num_data_streams, stream); auto stripes = gather_stripes(num_rows, num_index_streams, stripe_bounds, &enc_data.streams, &strm_descs); - auto device_columns = table_device_view::create(table); // Gather column statistics std::vector> column_stats; if (enable_statistics_ && num_columns > 0 && num_rows > 0) { @@ -1160,8 +1115,8 @@ void writer::impl::write(table_view const &table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks); - hostdevice_vector comp_in(num_compressed_blocks); + hostdevice_vector comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_in(num_compressed_blocks, stream); if (compression_kind_ != NONE) { 
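// [Editor's note] A minimal, hypothetical sketch (not part of this patch) of the accessor
// pattern the ORC/Parquet kernels above migrate to: instead of carrying a raw
// column_data_base pointer plus separate valid_map_base / column_offset fields, a kernel
// receives a cudf::column_device_view and reads values and validity through it directly.
// The kernel name and the byte-counting logic here are illustrative only.
#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/types.hpp>

__global__ void sum_string_bytes_sketch(cudf::column_device_view const d_strings,
                                        cudf::size_type* d_total)
{
  cudf::size_type const row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= d_strings.size()) return;
  // is_valid() consults the view's null mask and offset, replacing the manual
  // valid_map_base / column_offset bookkeeping removed above.
  if (!d_strings.is_valid(row)) return;
  // element<string_view>() replaces the removed nvstrdesc_s {ptr, count} descriptor.
  cudf::string_view const v = d_strings.element<cudf::string_view>(row);
  atomicAdd(d_total, v.size_bytes());
}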
strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index f0ec3a70cec..352cb11440f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -186,14 +186,18 @@ class writer::impl { /** * @brief Builds up column dictionaries indices * + * @param view Table device view representing input table * @param columns List of columns * @param str_col_ids List of columns that are strings type + * @param d_str_col_ids List of columns that are strings type in device memory * @param dict_data Dictionary data memory * @param dict_index Dictionary index memory * @param dict List of dictionary chunks */ - void init_dictionaries(orc_column_view* columns, + void init_dictionaries(const table_device_view& view, + orc_column_view* columns, std::vector const& str_col_ids, + device_span d_str_col_ids, uint32_t* dict_data, uint32_t* dict_index, hostdevice_vector* dict); @@ -238,13 +242,15 @@ class writer::impl { /** * @brief Encodes the input columns into streams. * + * @param view Table device view representing input table * @param columns List of columns * @param str_col_ids List of columns that are strings type * @param stripe_bounds List of stripe boundaries * @param stream CUDA stream used for device memory operations and kernel launches * @return Encoded data and per-chunk stream descriptors */ - encoded_data encode_columns(host_span columns, + encoded_data encode_columns(const table_device_view& view, + host_span columns, std::vector const& str_col_ids, host_span stripe_bounds, orc_streams const& streams); diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index 46d471d5cf7..2676f30474d 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -52,8 +52,10 @@ inline __device__ uint32_t uint64_hash16(uint64_t v) return uint32_hash16((uint32_t)(v + (v >> 32))); } -inline __device__ uint32_t nvstr_hash16(const uint8_t *p, uint32_t len) +inline __device__ uint32_t hash_string(const string_view &val) { + const char *p = val.data(); + uint32_t len = val.size_bytes(); uint32_t hash = len; if (len > 0) { uint32_t align_p = 3 & reinterpret_cast(p); @@ -181,7 +183,7 @@ __global__ void __launch_bounds__(block_size, 1) } else if (dtype == INT96) { dtype_len_in = 8; } else { - dtype_len_in = (dtype == BYTE_ARRAY) ? 
sizeof(nvstrdesc_s) : dtype_len; + dtype_len_in = dtype_len; } __syncthreads(); while (s->row_cnt < s->ck.num_rows) { @@ -206,7 +208,7 @@ __global__ void __launch_bounds__(block_size, 1) if (dtype == BYTE_ARRAY) { auto str1 = s->col.leaf_column->element(row); len += str1.size_bytes(); - hash = nvstr_hash16(reinterpret_cast(str1.data()), str1.size_bytes()); + hash = hash_string(str1); // Walk the list of rows with the same hash next_addr = &s->hashmap[hash]; while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3b29394686f..51ec0013f1a 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -79,8 +79,10 @@ struct page_enc_state_s { /** * @brief Return a 12-bit hash from a byte sequence */ -inline __device__ uint32_t nvstr_init_hash(const uint8_t *ptr, uint32_t len) +inline __device__ uint32_t hash_string(const string_view &val) { + char const *ptr = val.data(); + uint32_t len = val.size_bytes(); if (len != 0) { return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); } else { @@ -199,7 +201,7 @@ __global__ void __launch_bounds__(block_size) // dtype_len, which determines how much memory we need to allocate for the fragment. dtype_len_in = 8; } else { - dtype_len_in = (dtype == BYTE_ARRAY) ? sizeof(nvstrdesc_s) : dtype_len; + dtype_len_in = dtype_len; } __syncthreads(); @@ -218,7 +220,7 @@ __global__ void __launch_bounds__(block_size) if (dtype == BYTE_ARRAY) { auto str = s->col.leaf_column->element(val_idx); len += str.size_bytes(); - hash = nvstr_init_hash(reinterpret_cast(str.data()), str.size_bytes()); + hash = hash_string(str); } else if (dtype_len_in == 8) { hash = uint64_init_hash(s->col.leaf_column->element(val_idx)); } else { @@ -1059,7 +1061,7 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages, } else if (dtype == INT96) { dtype_len_in = 8; } else { - dtype_len_in = (dtype == BYTE_ARRAY) ? sizeof(nvstrdesc_s) : dtype_len_out; + dtype_len_in = dtype_len_out; } dict_bits = (dtype == BOOLEAN) ? 
1 : (s->page.dict_bits_plus1 - 1); if (t == 0) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 31baf419f45..1e8a6920ea4 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -969,7 +969,7 @@ void writer::impl::write(table_view const &table) } // Create table_device_view so that corresponding column_device_view data // can be written into col_desc members - auto parent_column_table_device_view = table_device_view::create(single_streams_table); + auto parent_column_table_device_view = table_device_view::create(single_streams_table, stream); rmm::device_uvector leaf_column_views(0, stream); // Initialize column description diff --git a/cpp/src/io/statistics/column_stats.cu b/cpp/src/io/statistics/column_stats.cu index 128bd905259..52f21f0a9ad 100644 --- a/cpp/src/io/statistics/column_stats.cu +++ b/cpp/src/io/statistics/column_stats.cu @@ -187,12 +187,6 @@ gatherFloatColumnStats(stats_state_s *s, statistics_dtype dtype, uint32_t t, Sto } } -// FIXME: Use native libcudf string type -struct nvstrdesc_s { - const char *ptr; - size_t count; -}; - /** * @brief Gather statistics for string columns * diff --git a/cpp/src/io/statistics/column_stats.h b/cpp/src/io/statistics/column_stats.h index d1d414aa7b4..d7895de50ce 100644 --- a/cpp/src/io/statistics/column_stats.h +++ b/cpp/src/io/statistics/column_stats.h @@ -45,10 +45,7 @@ struct stats_column_desc { uint32_t num_rows; //!< number of rows in column uint32_t num_values; //!< Number of data values in column. Different from num_rows in case of //!< nested columns - const uint32_t *valid_map_base; //!< base of valid bit map for this column (null if not present) - size_type column_offset; //! < index of the first element relative to the base memory - const void *column_data_base; //!< base ptr to column data - int32_t ts_scale; //!< timestamp scale (>0: multiply by scale, <0: divide by -scale) + int32_t ts_scale; //!< timestamp scale (>0: multiply by scale, <0: divide by -scale) column_device_view *leaf_column; //!< Pointer to leaf column column_device_view *parent_column; //!< Pointer to parent column. Is nullptr if not list type. diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b64e91c18bd..d827d03a6c0 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -442,7 +442,9 @@ std::pair, std::unique_ptr
> construct_join_output_ stream, rmm::mr::get_current_device_resource()); common_table = cudf::detail::concatenate( - {common_from_build->view(), common_from_probe->view()}, stream, mr); + std::vector({common_from_build->view(), common_from_probe->view()}), + stream, + mr); } joined_indices = concatenate_vector_pairs(complement_indices, joined_indices); } else { diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index c6ca56085c8..facf2827f56 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -48,7 +48,7 @@ namespace { * @param[in] mr Device memory resource used to allocate the * returned column's device memory. */ -std::unique_ptr merge_offsets(std::vector const& columns, +std::unique_ptr merge_offsets(host_span columns, size_type total_list_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -90,7 +90,7 @@ std::unique_ptr merge_offsets(std::vector const& colu * @copydoc cudf::lists::detail::concatenate */ std::unique_ptr concatenate( - std::vector const& columns, + host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -119,7 +119,7 @@ std::unique_ptr concatenate( // if any of the input columns have nulls, construct the output mask bool const has_nulls = - std::any_of(columns.cbegin(), columns.cend(), [](auto const& col) { return col.has_nulls(); }); + std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); rmm::device_buffer null_mask = create_null_mask( total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); if (has_nulls) { diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 783e0b4b1de..cb142c2c1e2 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -450,7 +450,8 @@ std::unique_ptr replace_kernel_forwarder::operator()({values.keys(), replacements.keys()}), stream); return cudf::dictionary::detail::add_keys(input, new_keys->view(), stream, mr); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index b8cb5e45fec..34d6d5fa194 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -838,8 +838,8 @@ std::unique_ptr grouped_time_range_rolling_window(table_view const& grou index_vector group_offsets(0, stream), group_labels(0, stream); if (group_keys.num_columns() > 0) { sort_groupby_helper helper{group_keys, cudf::null_policy::INCLUDE, cudf::sorted::YES}; - group_offsets = index_vector(helper.group_offsets(), stream); - group_labels = index_vector(helper.group_labels(), stream); + group_offsets = index_vector(helper.group_offsets(stream), stream); + group_labels = index_vector(helper.group_labels(stream), stream); } // Assumes that `timestamp_column` is actually of a timestamp type. diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 5c31e565530..d1a1169dae4 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
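// [Editor's note] Several concatenate internals above now take cudf::host_span<T const>
// rather than std::vector<T> const&. A hypothetical helper, shown only to illustrate the
// call-site flexibility this buys; the function name is not part of this patch.
#include <cudf/column/column_view.hpp>
#include <cudf/utilities/span.hpp>
#include <vector>

// Accepts any contiguous host range of column_views without copying it into a vector.
cudf::size_type total_rows(cudf::host_span<cudf::column_view const> cols)
{
  cudf::size_type n = 0;
  for (auto const& c : cols) n += c.size();
  return n;
}

// Usage: a std::vector<cudf::column_view> binds directly to the host_span parameter.
// A braced initializer list no longer deduces to the span overload, which is why the
// test changes near the end of this patch wrap arguments in an explicit std::vector.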
@@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -34,10 +35,12 @@ auto is_sorted(cudf::table_view const& in, std::vector const& null_precedence, rmm::cuda_stream_view stream) { - auto in_d = table_device_view::create(in); - rmm::device_vector d_column_order(column_order); + // 0-table_view, 1-column_order, 2-null_precedence, 3-validity_columns + auto flattened = structs::detail::flatten_nested_columns(in, column_order, null_precedence); + auto in_d = table_device_view::create(std::get<0>(flattened), stream); + rmm::device_vector d_column_order(std::get<1>(flattened)); rmm::device_vector const d_null_precedence = - (has_nulls) ? rmm::device_vector{null_precedence} + (has_nulls) ? rmm::device_vector{std::get<2>(flattened)} : rmm::device_vector{}; auto ineq_op = row_lexicographic_comparator( *in_d, *in_d, d_column_order.data().get(), d_null_precedence.data().get()); diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh index 4fc83d343d5..506334c2a3d 100644 --- a/cpp/src/sort/sort_impl.cuh +++ b/cpp/src/sort/sort_impl.cuh @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -112,7 +114,7 @@ std::unique_ptr sorted_order(table_view input, 0); // fast-path for single column sort - if (input.num_columns() == 1) { + if (input.num_columns() == 1 and not cudf::is_nested(input.column(0).type())) { auto const single_col = input.column(0); auto const col_order = column_order.empty() ? order::ASCENDING : column_order.front(); auto const null_prec = null_precedence.empty() ? null_order::BEFORE : null_precedence.front(); @@ -120,11 +122,13 @@ std::unique_ptr sorted_order(table_view input, : sorted_order(single_col, col_order, null_prec, stream, mr); } - auto device_table = table_device_view::create(input, stream); - rmm::device_vector d_column_order(column_order); + auto flattened = structs::detail::flatten_nested_columns(input, column_order, null_precedence); + auto& input_flattened = std::get<0>(flattened); + auto device_table = table_device_view::create(input_flattened, stream); + rmm::device_vector d_column_order(std::get<1>(flattened)); - if (has_nulls(input)) { - rmm::device_vector d_null_precedence(null_precedence); + if (has_nulls(input_flattened)) { + rmm::device_vector d_null_precedence(std::get<2>(flattened)); auto comparator = row_lexicographic_comparator( *device_table, *device_table, d_column_order.data().get(), d_null_precedence.data().get()); if (stable) { diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 65c6c8f2836..48358cb4a38 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
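// [Editor's note] Sketch of the flatten-then-compare flow that is_sorted and sorted_order
// adopt above, using the internal cudf::structs::detail::flatten_nested_columns API.
// The wrapper function is illustrative; the header path assumes cpp/src is on the include
// path, as it is inside the cudf build.
#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <structs/utilities.hpp>  // internal header extended by this patch
#include <rmm/cuda_stream_view.hpp>
#include <tuple>
#include <vector>

void flatten_then_compare_sketch(cudf::table_view const& input,
                                 std::vector<cudf::order> const& column_order,
                                 std::vector<cudf::null_order> const& null_precedence,
                                 rmm::cuda_stream_view stream)
{
  // Tuple contract: 0 = flattened table_view, 1 = per-flat-column order,
  // 2 = per-flat-column null_order, 3 = owning BOOL8 validity columns that must stay
  // alive for as long as the flattened view is used.
  auto flattened = cudf::structs::detail::flatten_nested_columns(input, column_order, null_precedence);
  auto const& flat_table = std::get<0>(flattened);
  auto d_table = cudf::table_device_view::create(flat_table, stream);
  // From here a row_lexicographic_comparator is built over *d_table with std::get<1> and
  // std::get<2>, exactly as in sort_impl.cuh above. Note the single-column fast path is
  // now skipped for nested types, since structs only become comparable after flattening.
}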
@@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include +#include "thrust/iterator/transform_iterator.h" #include #include @@ -65,8 +67,7 @@ struct chars_size_transform { } }; -auto create_strings_device_views(std::vector const& views, - rmm::cuda_stream_view stream) +auto create_strings_device_views(host_span views, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); // Assemble contiguous array of device views @@ -77,33 +78,30 @@ auto create_strings_device_views(std::vector const& views, // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = thrust::host_vector(views.size() + 1); + auto input_offsets = std::vector(views.size() + 1); auto offset_it = std::next(input_offsets.begin()); thrust::transform( - thrust::host, views.cbegin(), views.cend(), offset_it, [](auto const& col) -> size_t { + thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { return static_cast(col.size()); }); thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it); - auto const d_input_offsets = rmm::device_vector{input_offsets}; - auto const output_size = input_offsets.back(); + auto d_input_offsets = cudf::detail::make_device_uvector_async(input_offsets, stream); + auto const output_size = input_offsets.back(); // Compute the partition offsets and size of chars column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - // Note: Using separate transform and inclusive_scan because - // transform_inclusive_scan fails to compile with: - // error: the default constructor of "cudf::column_device_view" cannot be - // referenced -- it is a deleted function - auto d_partition_offsets = rmm::device_vector(views.size() + 1); - thrust::transform(rmm::exec_policy(stream), - device_views_ptr, - device_views_ptr + views.size(), - std::next(d_partition_offsets.begin()), - chars_size_transform{}); - thrust::inclusive_scan(rmm::exec_policy(stream), - d_partition_offsets.cbegin(), - d_partition_offsets.cend(), - d_partition_offsets.begin()); - auto const output_chars_size = d_partition_offsets.back(); + auto d_partition_offsets = rmm::device_uvector(views.size() + 1, stream); + size_t zero{0}; + d_partition_offsets.set_element_async(0, zero, stream); // zero first element + + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + device_views_ptr, + device_views_ptr + views.size(), + std::next(d_partition_offsets.begin()), + chars_size_transform{}, + thrust::plus{}); + auto const output_chars_size = d_partition_offsets.back_element(stream); + stream.synchronize(); // ensure copy of output_chars_size is complete before returning return std::make_tuple(std::move(device_view_owners), device_views_ptr, @@ -205,7 +203,7 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const* } } -std::unique_ptr concatenate(std::vector const& columns, +std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -257,8 +255,8 @@ std::unique_ptr concatenate(std::vector const& columns, : fused_concatenate_string_offset_kernel; kernel<<>>( d_views, - d_input_offsets.data().get(), - d_partition_offsets.data().get(), + d_input_offsets.data(), + d_partition_offsets.data(), static_cast(columns.size()), strings_count, d_new_offsets, @@ -277,7 +275,7 @@ std::unique_ptr concatenate(std::vector const& columns, auto const kernel = 
fused_concatenate_string_chars_kernel; kernel<<>>( d_views, - d_partition_offsets.data().get(), + d_partition_offsets.data(), static_cast(columns.size()), total_bytes, d_new_chars); diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index b2f861c7c8d..6f18c4bcbd4 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -36,7 +36,7 @@ namespace detail { /** * @copydoc cudf::structs::detail::concatenate */ -std::unique_ptr concatenate(std::vector const& columns, +std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -49,7 +49,7 @@ std::unique_ptr concatenate(std::vector const& columns, std::transform(ordered_children.begin(), ordered_children.end(), std::back_inserter(children), - [mr, stream](std::vector const& cols) { + [mr, stream](host_span cols) { return cudf::detail::concatenate(cols, stream, mr); }); @@ -57,7 +57,7 @@ std::unique_ptr concatenate(std::vector const& columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = - std::any_of(columns.cbegin(), columns.cend(), [](auto const& col) { return col.has_nulls(); }); + std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); rmm::device_buffer null_mask = create_null_mask(total_length, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); if (has_nulls) { diff --git a/cpp/src/structs/utilities.cu b/cpp/src/structs/utilities.cu index 09e6c5d949d..174e36a1628 100644 --- a/cpp/src/structs/utilities.cu +++ b/cpp/src/structs/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,10 @@ #include #include +#include +#include #include +#include namespace cudf { namespace structs { @@ -27,7 +30,7 @@ namespace detail { * @copydoc cudf::structs::detail::extract_ordered_struct_children */ std::vector> extract_ordered_struct_children( - std::vector const& struct_cols) + host_span struct_cols) { auto const num_children = struct_cols[0].num_children(); auto const num_cols = static_cast(struct_cols.size()); @@ -56,6 +59,103 @@ std::vector> extract_ordered_struct_children( return result; } +/** + * @brief Flattens struct columns to constituent non-struct columns in the input table. + * + */ +struct flattened_table { + // reference variables + table_view const& input; + std::vector const& column_order; + std::vector const& null_precedence; + // output + std::vector> validity_as_column; + std::vector flat_columns; + std::vector flat_column_order; + std::vector flat_null_precedence; + + flattened_table(table_view const& input, + std::vector const& column_order, + std::vector const& null_precedence) + : input(input), column_order(column_order), null_precedence(null_precedence) + { + } + + // Convert null_mask to BOOL8 columns and flatten the struct children in order. + void flatten_struct_column(structs_column_view const& col, + order col_order, + null_order col_null_order) + { + if (col.nullable()) { + validity_as_column.push_back(cudf::is_valid(col)); + validity_as_column.back()->set_null_mask(copy_bitmask(col)); + flat_columns.push_back(validity_as_column.back()->view()); + if (not column_order.empty()) flat_column_order.push_back(col_order); // doesn't matter. 
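// [Editor's note] Worked example (hypothetical schema, not from this patch) of what the
// flattening implemented by flatten_struct_column in this hunk produces. For an input table
//   col0: STRUCT<name: STRING, age: INT32>, nullable, column_order = ASCENDING,
//         null_precedence = AFTER
//   col1: FLOAT64, column_order = DESCENDING, null_precedence = AFTER
// the flattened outputs are
//   flat_columns         = [ BOOL8 validity(col0), STRING name, INT32 age, FLOAT64 col1 ]
//   flat_column_order    = [ ASCENDING, ASCENDING, ASCENDING, DESCENDING ]
//   flat_null_precedence = [ AFTER,     BEFORE,    BEFORE,    AFTER      ]
// i.e. the struct's own null mask is materialized as a leading BOOL8 column (cudf::is_valid
// plus copy_bitmask, as in the code here), the children inherit the struct's sort order,
// and child null_precedence defaults to null_order::BEFORE.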
+ if (not null_precedence.empty()) flat_null_precedence.push_back(col_null_order); + } + for (decltype(col.num_children()) i = 0; i < col.num_children(); ++i) { + auto const& child = col.get_sliced_child(i); + if (child.type().id() == type_id::STRUCT) { + flatten_struct_column(structs_column_view{child}, col_order, null_order::BEFORE); + // default spark behaviour is null_order::BEFORE + } else { + flat_columns.push_back(child); + if (not column_order.empty()) flat_column_order.push_back(col_order); + if (not null_precedence.empty()) flat_null_precedence.push_back(null_order::BEFORE); + // default spark behaviour is null_order::BEFORE + } + } + } + // Note: possibly expand for flattening list columns too. + + /** + * @copydoc flattened_table + * + * @return tuple with flattened table, flattened column order, flattened null precedence, + * vector of boolean columns (struct validity). + */ + auto operator()() + { + for (auto i = 0; i < input.num_columns(); ++i) { + auto const& col = input.column(i); + if (col.type().id() == type_id::STRUCT) { + flatten_struct_column(structs_column_view{col}, + (column_order.empty() ? order() : column_order[i]), + (null_precedence.empty() ? null_order() : null_precedence[i])); + } else { + flat_columns.push_back(col); + if (not column_order.empty()) flat_column_order.push_back(column_order[i]); + if (not null_precedence.empty()) flat_null_precedence.push_back(null_precedence[i]); + } + } + + return std::make_tuple(table_view{flat_columns}, + std::move(flat_column_order), + std::move(flat_null_precedence), + std::move(validity_as_column)); + } +}; + +/** + * @copydoc cudf::detail::flatten_nested_columns + */ +std::tuple, + std::vector, + std::vector>> +flatten_nested_columns(table_view const& input, + std::vector const& column_order, + std::vector const& null_precedence) +{ + std::vector> validity_as_column; + auto const has_struct = std::any_of( + input.begin(), input.end(), [](auto const& col) { return col.type().id() == type_id::STRUCT; }); + if (not has_struct) + return std::make_tuple(input, column_order, null_precedence, std::move(validity_as_column)); + + return flattened_table{input, column_order, null_precedence}(); +} + } // namespace detail } // namespace structs } // namespace cudf diff --git a/cpp/src/structs/utilities.hpp b/cpp/src/structs/utilities.hpp index 1e0511cfd83..c0111d0bbde 100644 --- a/cpp/src/structs/utilities.hpp +++ b/cpp/src/structs/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #pragma once #include +#include +#include namespace cudf { namespace structs { @@ -45,7 +47,26 @@ namespace detail { * @return New column with concatenated results. */ std::vector> extract_ordered_struct_children( - std::vector const& struct_cols); + host_span struct_cols); + +/** + * @brief Flatten table with struct columns to table with constituent columns of struct columns. + * + * If a table does not have struct columns, same input arguments are returned. + * + * @param input input table to be flattened + * @param column_order column order for input table + * @param null_precedence null order for input table + * @return tuple with flattened table, flattened column order, flattened null precedence, + * vector of boolean columns (struct validity). 
+ */ +std::tuple, + std::vector, + std::vector>> +flatten_nested_columns(table_view const& input, + std::vector const& column_order, + std::vector const& null_precedence); } // namespace detail } // namespace structs diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index bdce1c325c5..62daeed6d79 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,8 @@ #include +#include + namespace cudf { namespace detail { template @@ -52,5 +54,45 @@ template class table_device_view_base; // Explicit instantiation for a device table of mutable views template class table_device_view_base; +namespace { +struct is_relationally_comparable_impl { + template + constexpr bool operator()() + { + return cudf::is_relationally_comparable(); + } +}; +} // namespace + +template +bool is_relationally_comparable(TableView const& lhs, TableView const& rhs) +{ + return thrust::all_of(thrust::counting_iterator(0), + thrust::counting_iterator(lhs.num_columns()), + [lhs, rhs] __device__(auto const i) { + // Simplified this for compile time. (Ideally use double_type_dispatcher) + // TODO: possible to implement without double type dispatcher. + return lhs.column(i).type() == rhs.column(i).type() and + type_dispatcher(lhs.column(i).type(), + is_relationally_comparable_impl{}); + }); +} + +// Explicit extern template instantiation for a table of immutable views +extern template bool is_relationally_comparable(table_view const& lhs, + table_view const& rhs); + +// Explicit extern template instantiation for a table of mutable views +extern template bool is_relationally_comparable(mutable_table_view const& lhs, + mutable_table_view const& rhs); + +// Explicit extern template instantiation for a device table of immutable views +template bool is_relationally_comparable(table_device_view const& lhs, + table_device_view const& rhs); + +// Explicit extern template instantiation for a device table of mutable views +template bool is_relationally_comparable( + mutable_table_device_view const& lhs, mutable_table_device_view const& rhs); + } // namespace detail } // namespace cudf diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 3c583622ed8..4a41dacbd30 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -50,7 +50,7 @@ struct ngram_generator_fn { cudf::column_device_view const d_strings; cudf::size_type ngrams; cudf::string_view const d_separator; - int32_t const* d_offsets{}; + int32_t* d_offsets{}; char* d_chars{}; /** @@ -62,7 +62,7 @@ struct ngram_generator_fn { * @param idx Index of the kernel thread. * @return Number of bytes required for the string for this thread. */ - __device__ cudf::size_type operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) { char* out_ptr = d_chars ? 
d_chars + d_offsets[idx] : nullptr; cudf::size_type bytes = 0; @@ -74,7 +74,7 @@ struct ngram_generator_fn { bytes += d_separator.size_bytes(); if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator); } - return bytes; + if (!d_chars) d_offsets[idx] = bytes; } }; @@ -109,11 +109,11 @@ std::unique_ptr generate_ngrams( if (d_strings.is_null(idx)) return false; return !d_strings.element(idx).empty(); }, - stream, - mr) + stream) ->release(); strings_count = table_offsets.front()->size() - 1; - return std::move(table_offsets.front()); + auto result = std::move(table_offsets.front()); + return result; }(); // this allows freeing the temporary table_offsets CUDF_EXPECTS(strings_count >= ngrams, "Insufficient number of strings to generate ngrams"); @@ -131,30 +131,13 @@ std::unique_ptr generate_ngrams( // compute the number of strings of ngrams auto const ngrams_count = strings_count - ngrams + 1; - // build output offsets by computing the output bytes for each generated ngram - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, ngram_generator_fn{d_strings, ngrams, d_separator}); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + ngrams_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - - // build the chars column - // generate the ngrams from the input strings and copy them into the chars data buffer - cudf::size_type const total_bytes = thrust::device_pointer_cast(d_offsets)[ngrams_count]; - auto chars_column = - cudf::strings::detail::create_chars_child_column(ngrams_count, 0, total_bytes, stream, mr); - char* const d_chars = chars_column->mutable_view().data(); - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - ngrams_count, - ngram_generator_fn{d_strings, ngrams, d_separator, d_offsets, d_chars}); - chars_column->set_null_count(0); + auto children = cudf::strings::detail::make_strings_children( + ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, 0, stream, mr); // make the output strings column from the offsets and chars column return cudf::make_strings_column(ngrams_count, - std::move(offsets_column), - std::move(chars_column), + std::move(children.first), + std::move(children.second), 0, rmm::device_buffer{0, stream, mr}, stream, diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index e63cbac1e72..cea53326895 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -99,7 +99,7 @@ TYPED_TEST(TypedColumnTest, ConcatenateNoColumns) TYPED_TEST(TypedColumnTest, ConcatenateColumnView) { - cudf::column original{this->type(), this->num_elements(), this->data, this->mask}; + column original{this->type(), this->num_elements(), this->data, this->mask}; std::vector indices{0, this->num_elements() / 3, this->num_elements() / 3, @@ -223,7 +223,7 @@ TEST_F(TableTest, ConcatenateTables) cols_table2.push_back(col3_table2.release()); Table t2(std::move(cols_table2)); - auto concat_table = cudf::concatenate({t1.view(), t2.view()}); + auto concat_table = cudf::concatenate(std::vector({t1, t2})); CUDF_TEST_EXPECT_TABLES_EQUAL(*concat_table, gold_table); } @@ -341,7 +341,8 @@ TEST_F(TableTest, SizeOverflowTest) auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); cudf::table_view tbl({*many_chars}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error); 
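// [Editor's note] Minimal sketch of the call-site pattern the surrounding test edits adopt:
// cudf::concatenate now takes a host_span, so a braced list of views no longer deduces and
// is wrapped in an explicit std::vector first. The helper name is illustrative only.
#include <cudf/concatenate.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <memory>
#include <vector>

std::unique_ptr<cudf::table> concat_twice(cudf::table_view const& tbl)
{
  // Before: cudf::concatenate({tbl, tbl});  // initializer_list no longer matches host_span
  auto views = std::vector<cudf::table_view>({tbl, tbl});
  return cudf::concatenate(views);  // host_span<table_view const> binds to the vector
}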
+ EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), + cudf::logic_error); } // string column, overflow on chars @@ -356,7 +357,8 @@ TEST_F(TableTest, SizeOverflowTest) 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error); + EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), + cudf::logic_error); } // string column, overflow on offsets (rows) @@ -372,7 +374,8 @@ TEST_F(TableTest, SizeOverflowTest) size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error); + EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), + cudf::logic_error); } // list, structs too long @@ -395,8 +398,8 @@ TEST_F(TableTest, SizeOverflowTest) 1, offsets.release(), std::move(struct_col), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}), - cudf::logic_error); + auto tables = std::vector({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}); + EXPECT_THROW(cudf::concatenate(tables), cudf::logic_error); } // struct, list child too long @@ -419,8 +422,8 @@ TEST_F(TableTest, SizeOverflowTest) auto col = cudf::make_structs_column(size, std::move(children), 0, rmm::device_buffer{0}); cudf::table_view tbl({*col}); - EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}), - cudf::logic_error); + auto tables = std::vector({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}); + EXPECT_THROW(cudf::concatenate(tables), cudf::logic_error); } } @@ -463,12 +466,14 @@ TEST_F(StructsColumnTest, ConcatenateStructs) // build expected output std::vector> expected_children; - expected_children.push_back( - cudf::concatenate({name_cols[0], name_cols[1], name_cols[2], name_cols[3]})); - expected_children.push_back( - cudf::concatenate({age_cols[0], age_cols[1], age_cols[2], age_cols[3]})); - expected_children.push_back( - cudf::concatenate({is_human_cols[0], is_human_cols[1], is_human_cols[2], is_human_cols[3]})); + auto name_col_vec = + std::vector({name_cols[0], name_cols[1], name_cols[2], name_cols[3]}); + auto age_col_vec = std::vector({age_cols[0], age_cols[1], age_cols[2], age_cols[3]}); + auto is_human_col_vec = std::vector( + {is_human_cols[0], is_human_cols[1], is_human_cols[2], is_human_cols[3]}); + expected_children.push_back(cudf::concatenate(name_col_vec)); + expected_children.push_back(cudf::concatenate(age_col_vec)); + expected_children.push_back(cudf::concatenate(is_human_col_vec)); std::vector struct_validity({1, 0, 1, 1, 1, 0}); auto expected = make_structs_column( 6, @@ -484,7 +489,7 @@ TEST_F(StructsColumnTest, ConcatenateStructs) src.push_back(structs_column_wrapper({name_cols[3], age_cols[3], is_human_cols[3]}, {1, 0})); // concatenate - auto result = cudf::concatenate({src[0], src[1], src[2], src[3]}); + auto result = cudf::concatenate(std::vector({src[0], src[1], src[2], src[3]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -536,9 +541,13 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs) // build expected output std::vector> expected_children; - expected_children.push_back(cudf::concatenate({split_names_cols[0], split_names_cols[1]})); - expected_children.push_back(cudf::concatenate({split_ages_cols[0], 
split_ages_cols[1]})); - expected_children.push_back(cudf::concatenate({split_is_human_cols[0], split_is_human_cols[1]})); + auto expected_names = std::vector({split_names_cols[0], split_names_cols[1]}); + auto expected_ages = std::vector({split_ages_cols[0], split_ages_cols[1]}); + auto expected_is_human = + std::vector({split_is_human_cols[0], split_is_human_cols[1]}); + expected_children.push_back(cudf::concatenate(expected_names)); + expected_children.push_back(cudf::concatenate(expected_ages)); + expected_children.push_back(cudf::concatenate(expected_is_human)); auto expected = make_structs_column(7, std::move(expected_children), 0, rmm::device_buffer{}); // concatenate as structs @@ -552,7 +561,8 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs) } // concatenate - auto result = cudf::concatenate({src[0], src[1]}); + + auto result = cudf::concatenate(std::vector({src[0], src[1]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -607,8 +617,11 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested) // build expected output std::vector> expected_children; - expected_children.push_back(cudf::concatenate({inner_structs[0], inner_structs[1]})); - expected_children.push_back(cudf::concatenate({inner_lists[0], inner_lists[1]})); + + expected_children.push_back( + cudf::concatenate(std::vector({inner_structs[0], inner_structs[1]}))); + expected_children.push_back( + cudf::concatenate(std::vector({inner_lists[0], inner_lists[1]}))); auto expected = make_structs_column(11, std::move(expected_children), 0, rmm::device_buffer{}); // concatenate as structs @@ -621,7 +634,7 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested) } // concatenate - auto result = cudf::concatenate({src[0], src[1]}); + auto result = cudf::concatenate(std::vector({src[0], src[1]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -635,7 +648,7 @@ TEST_F(ListsColumnTest, ConcatenateLists) cudf::test::lists_column_wrapper b{4, 5, 6, 7, 8, 9, 10}; cudf::test::lists_column_wrapper expected{{0, 1, 2, 3}, {4, 5, 6, 7, 8, 9, 10}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -646,7 +659,7 @@ TEST_F(ListsColumnTest, ConcatenateLists) cudf::test::lists_column_wrapper expected{ {0, 1, 1}, {2, 3}, {4, 5}, {6}, {8, 9, 9, 9}, {10, 11}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -657,7 +670,7 @@ TEST_F(ListsColumnTest, ConcatenateLists) cudf::test::lists_column_wrapper expected{ {0, 1}, {2, 3, 4, 5}, {6, 7, 8}, {9}, {10, 11}, {12, 13, 14, 15}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -674,7 +687,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper b{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{4, 5, 6, 7}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -684,7 +697,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper d{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{4, 5, 6, 7}; - auto result = cudf::concatenate({a, b, c, d}); + auto result = cudf::concatenate(std::vector({a, b, c, d})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -694,7 +707,7 @@ TEST_F(ListsColumnTest, 
ConcatenateEmptyLists) cudf::test::lists_column_wrapper b{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{LCW{}, {4, 5, 6, 7}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -704,7 +717,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper d{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{LCW{}, LCW{}, LCW{}, {4, 5, 6, 7}}; - auto result = cudf::concatenate({a, b, c, d}); + auto result = cudf::concatenate(std::vector({a, b, c, d})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -715,7 +728,7 @@ TEST_F(ListsColumnTest, ConcatenateEmptyLists) cudf::test::lists_column_wrapper d{4, 5, 6, 7}; cudf::test::lists_column_wrapper expected{{1, 2}, LCW{}, LCW{}, {4, 5, 6, 7}}; - auto result = cudf::concatenate({a, b, c, d}); + auto result = cudf::concatenate(std::vector({a, b, c, d})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -732,7 +745,7 @@ TEST_F(ListsColumnTest, ConcatenateListsWithNulls) cudf::test::lists_column_wrapper b{{{4, 6, 7}, valids}}; cudf::test::lists_column_wrapper expected{{{0, 1, 2, 3}, valids}, {{4, 6, 7}, valids}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -746,7 +759,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedLists) cudf::test::lists_column_wrapper expected{ {{0, 1}, {2}}, {{4, 5, 6, 7, 8, 9, 10}}, {{6, 7}}, {{8, 9, 10}, {11, 12}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -770,7 +783,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedLists) {{{31, 32}, {33, 34}}, {{35, 36}, {37, 38}}, {{39, 40}}}, {{{71, 72}, {74}}, {{75, 76, 77, 78}, {77, 78}}, {{79, 80, 81}}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -789,7 +802,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedEmptyLists) cudf::test::lists_column_wrapper expected{ {{LCW{}}}, {{0, 1}, {2, 3}}, {{6, 7}}, {LCW{}, {11, 12}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -815,7 +828,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedEmptyLists) {{{31, 32}, {33, 34}}, {{35, 36}, {37, 38}, {1, 2}}, {{39, 40}}}, {{{LCW{}}}}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -834,7 +847,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedListsWithNulls) cudf::test::lists_column_wrapper expected{{{{0, 1}, {2, 3}}, valids}, {{{4}, {6, 7}}, valids}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -848,7 +861,7 @@ TEST_F(ListsColumnTest, ConcatenateNestedListsWithNulls) {{6, 7}}, {{{{8, 9, 10}, valids}, {11, 12}}, valids}}; - auto result = cudf::concatenate({a, b}); + auto result = cudf::concatenate(std::vector({a, b})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); } @@ -864,7 +877,8 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies) cudf::test::lists_column_wrapper a{{{{LCW{}}}}}; cudf::test::lists_column_wrapper b{{{LCW{}}}}; cudf::test::lists_column_wrapper c{{LCW{}}}; - EXPECT_THROW(cudf::concatenate({a, b, 
c}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b, c})), cudf::logic_error); } { @@ -872,20 +886,23 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies) cudf::test::lists_column_wrapper a{{{{{LCW{}}}}, valids.begin()}}; cudf::test::lists_column_wrapper b{{{LCW{}}}}; cudf::test::lists_column_wrapper c{{LCW{}}}; - EXPECT_THROW(cudf::concatenate({a, b, c}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b, c})), cudf::logic_error); } { cudf::test::lists_column_wrapper a{{{{LCW{}}}}}; cudf::test::lists_column_wrapper b{1, 2, 3}; cudf::test::lists_column_wrapper c{{3, 4, 5}}; - EXPECT_THROW(cudf::concatenate({a, b, c}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b, c})), cudf::logic_error); } { cudf::test::lists_column_wrapper a{{{1, 2, 3}}}; cudf::test::lists_column_wrapper b{{4, 5}}; - EXPECT_THROW(cudf::concatenate({a, b}), cudf::logic_error); + + EXPECT_THROW(cudf::concatenate(std::vector({a, b})), cudf::logic_error); } } @@ -910,14 +927,16 @@ TEST_F(ListsColumnTest, SlicedColumns) {{4, 4, 4}, {5, 5}, {6, 6}}, {{-1, -1, -1, -1}, {-2}}, {{-3, -3, -3, -3}, {-4}}}; - auto result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{{{1, 1, 1}, {2, 2}, {3, 3}}, {{4, 4, 4}, {5, 5}, {6, 6}}, {{-5, -5, -5, -5}, {-6}}, {{-7, -7, -7, -7}, {-8}}}; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -926,14 +945,16 @@ TEST_F(ListsColumnTest, SlicedColumns) {{-1, -1, -1, -1}, {-2}}, {{-3, -3, -3, -3}, {-4}}, }; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); cudf::test::lists_column_wrapper expected3{{{7, 7, 7}, {8, 8}, {9, 9}}, {{10, 10, 10}, {11, 11}, {12, 12}}, {{-5, -5, -5, -5}, {-6}}, {{-7, -7, -7, -7}, {-8}}}; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } @@ -958,7 +979,9 @@ TEST_F(ListsColumnTest, SlicedColumns) {LCW{}, {LCW{}}, {{6, 6}, {2}}}, {{LCW{}}}, {LCW{}, {LCW{}}}}; - auto result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); + cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{ @@ -967,7 +990,8 @@ TEST_F(ListsColumnTest, SlicedColumns) {{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, {{LCW{}}}, }; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -975,7 +999,8 @@ TEST_F(ListsColumnTest, SlicedColumns) {LCW{}, LCW{}, {{10, 10, 10}, {11, 11}, {12, 12}}, LCW{}}, {{LCW{}}}, {LCW{}, {LCW{}}}}; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); 
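The recurring edit in these concatenate tests is to build an explicit std::vector of views and pass that to cudf::concatenate instead of handing it a braced initializer list; the same pattern appears later for table_view concatenation in the ORC and Parquet chunked-writer tests. A minimal sketch of the new call pattern, with the template arguments (elided in the hunks above) filled in by assumption and an illustrative helper name:

    #include <cudf/column/column.hpp>
    #include <cudf/column/column_view.hpp>
    #include <cudf/concatenate.hpp>
    #include <cudf_test/column_wrapper.hpp>

    #include <memory>
    #include <vector>

    // Hypothetical helper, not part of the patch: concatenates two columns the
    // way the updated tests do.
    std::unique_ptr<cudf::column> concat_two_columns()
    {
      // Test wrappers convert implicitly to cudf::column_view.
      cudf::test::fixed_width_column_wrapper<int32_t> a{0, 1, 2, 3};
      cudf::test::fixed_width_column_wrapper<int32_t> b{4, 5, 6, 7};

      // Build the vector of views explicitly, then concatenate; the result owns
      // a single column holding {0, 1, 2, 3, 4, 5, 6, 7}.
      auto views = std::vector<cudf::column_view>({a, b});
      return cudf::concatenate(views);
    }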
cudf::test::lists_column_wrapper expected3{ @@ -984,7 +1009,8 @@ TEST_F(ListsColumnTest, SlicedColumns) {{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, {{LCW{}}}, }; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } } @@ -1015,14 +1041,16 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{{{-1, -1, -1, -1}, valids}, {-2}}, valids}, {{{{-3, -3, -3, -3}, valids}, {-4}}, valids}, {{{{-5, -5, -5, -5}, valids}, {-6}}, valids}}; - auto result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{{{{1, 1, 1}, valids}, {2, 2}, {{3, 3}, valids}}, {{{4, 4, 4}, {{5, 5}, valids}, {6, 6}}, valids}, {{7, 7, 7}, {8, 8}, {9, 9}}, {{{{-7, -7, -7, -7}, valids}, {-8}}, valids}}; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -1030,13 +1058,15 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{{{-1, -1, -1, -1}, valids}, {-2}}, valids}, {{{{-3, -3, -3, -3}, valids}, {-4}}, valids}, {{{{-5, -5, -5, -5}, valids}, {-6}}, valids}}; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); cudf::test::lists_column_wrapper expected3{ {{{10, 10, 10}, {11, 11}, {{12, 12}, valids}}, valids}, {{{{-7, -7, -7, -7}, valids}, {-8}}, valids}}; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } @@ -1068,7 +1098,8 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{LCW{}, {{LCW{}}, valids}}, valids}, {{{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, valids}, }; - auto result0 = cudf::concatenate({split_a[0], split_b[0]}); + + auto result0 = cudf::concatenate(std::vector({split_a[0], split_b[0]})); cudf::test::expect_columns_equivalent(*result0, expected0); cudf::test::lists_column_wrapper expected1{ @@ -1079,7 +1110,8 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{{LCW{}, LCW{}}, valids}}, {{LCW{}}}, }; - auto result1 = cudf::concatenate({split_a[0], split_b[1]}); + + auto result1 = cudf::concatenate(std::vector({split_a[0], split_b[1]})); cudf::test::expect_columns_equivalent(*result1, expected1); cudf::test::lists_column_wrapper expected2{ @@ -1088,14 +1120,16 @@ TEST_F(ListsColumnTest, SlicedColumnsWithNulls) {{LCW{}, {{LCW{}}, valids}}, valids}, {{{{1, 2, 9}, LCW{}}, {{5, 6, 7, 8, 9}, {0}, {15, 17}}}, valids}, }; - auto result2 = cudf::concatenate({split_a[1], split_b[0]}); + + auto result2 = cudf::concatenate(std::vector({split_a[1], split_b[0]})); cudf::test::expect_columns_equivalent(*result2, expected2); cudf::test::lists_column_wrapper expected3{ {LCW{}, LCW{}, {{{10, 10, 10}, {{11, 11}, valids}, {12, 12}}, valids}, LCW{}}, {{LCW{}}}, }; - auto result3 = cudf::concatenate({split_a[1], split_b[1]}); + + auto result3 = cudf::concatenate(std::vector({split_a[1], split_b[1]})); cudf::test::expect_columns_equivalent(*result3, expected3); } } @@ -1140,11 +1174,12 @@ TEST_F(ListsColumnTest, 
ListOfStructs) } // build expected output - auto expected_child = - cudf::concatenate({inner_structs[0], inner_structs[1], inner_structs[2], inner_structs[3]}); + auto struct_views = std::vector( + {inner_structs[0], inner_structs[1], inner_structs[2], inner_structs[3]}); + auto expected_child = cudf::concatenate(struct_views); fixed_width_column_wrapper offsets_w{0, 1, 1, 1, 1, 4, 6, 6, 6, 10, 11}; - auto expected = make_lists_column( - 10, std::move(offsets_w.release()), std::move(expected_child), 0, rmm::device_buffer{}); + auto expected = + make_lists_column(10, offsets_w.release(), std::move(expected_child), 0, rmm::device_buffer{}); // lists std::vector> offsets; @@ -1154,7 +1189,7 @@ TEST_F(ListsColumnTest, ListOfStructs) offsets.push_back({0, 0, 4, 5}); // concatenate as lists - std::vector> src; + std::vector> src; for (size_t idx = 0; idx < inner_structs.size(); idx++) { int size = static_cast(offsets[idx]).size() - 1; src.push_back(make_lists_column( @@ -1162,7 +1197,7 @@ TEST_F(ListsColumnTest, ListOfStructs) } // concatenate - auto result = cudf::concatenate({*src[0], *src[1], *src[2], *src[3]}); + auto result = cudf::concatenate(std::vector({*src[0], *src[1], *src[2], *src[3]})); cudf::test::expect_columns_equivalent(*result, *expected); } @@ -1189,8 +1224,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointConcatentate) auto const b = fw_wrapper(vec.begin() + 300, vec.begin() + 700); auto const c = fw_wrapper(vec.begin() + 700, vec.end()); - auto const columns = std::vector{a, b, c}; - auto const results = cudf::concatenate(columns); + auto const results = cudf::concatenate(std::vector{a, b, c}); auto const expected = fw_wrapper(vec.begin(), vec.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -1208,8 +1242,7 @@ TEST_F(FixedPointTest, FixedPointConcatentate) auto const b = fp_wrapper(vec.begin() + 300, vec.begin() + 700, scale_type{-2}); auto const c = fp_wrapper(vec.begin() + 700, vec.end(), /*****/ scale_type{-2}); - auto const columns = std::vector{a, b, c}; - auto const results = cudf::concatenate(columns); + auto const results = cudf::concatenate(std::vector{a, b, c}); auto const expected = fp_wrapper(vec.begin(), vec.end(), scale_type{-2}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -1227,8 +1260,7 @@ TEST_F(FixedPointTest, FixedPointScaleMismatch) auto const b = fp_wrapper(vec.begin() + 300, vec.begin() + 700, scale_type{-2}); auto const c = fp_wrapper(vec.begin() + 700, vec.end(), /*****/ scale_type{-3}); - auto const columns = std::vector{a, b, c}; - EXPECT_THROW(cudf::concatenate(columns), cudf::logic_error); + EXPECT_THROW(cudf::concatenate(std::vector{a, b, c}), cudf::logic_error); } struct DictionaryConcatTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 9f5bbe2dcb9..d79307dcbf6 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -168,7 +168,7 @@ TEST_F(FromArrowTest, StructColumn) std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; auto str_col = cudf::test::strings_column_wrapper{ - "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Uberwald"} + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} .release(); auto str_col2 = cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); @@ -198,7 +198,7 @@ TEST_F(FromArrowTest, StructColumn) cudf::table_view expected_cudf_table({struct_col->view()}); // Create Arrow table - std::vector str{"Samuel 
Vimes", "Carrot Ironfoundersson", "Angua von Uberwald"}; + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; auto str_array = get_arrow_array(str); auto int_array = get_arrow_array({48, 27, 25}); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index c8e56711135..57275433516 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -270,7 +270,7 @@ TEST_F(ToArrowTest, StructColumn) std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; auto str_col = cudf::test::strings_column_wrapper{ - "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Uberwald"} + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} .release(); auto str_col2 = cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); @@ -306,7 +306,7 @@ TEST_F(ToArrowTest, StructColumn) metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; // Create Arrow table - std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Uberwald"}; + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; auto str_array = get_arrow_array(str); auto int_array = get_arrow_array({48, 27, 25}); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b0dc01ea001..108befa80a7 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -395,7 +396,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 || i <= 60); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); auto col5_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); @@ -657,7 +658,7 @@ TEST_F(OrcChunkedWriterTest, SimpleTable) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedSimple.orc"); cudf_io::chunked_orc_writer_options opts = @@ -677,7 +678,7 @@ TEST_F(OrcChunkedWriterTest, LargeTables) auto table1 = create_random_fixed_table(512, 4096, true); auto table2 = create_random_fixed_table(512, 8192, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedLarge.orc"); cudf_io::chunked_orc_writer_options opts = @@ -737,7 +738,7 @@ TEST_F(OrcChunkedWriterTest, Strings) cols.push_back(strings2.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedStrings.orc"); cudf_io::chunked_orc_writer_options opts = @@ -799,7 +800,7 @@ TEST_F(OrcChunkedWriterTest, ReadStripes) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table2, *table1, *table2}); + auto full_table = 
cudf::concatenate(std::vector({*table2, *table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedStripes.orc"); cudf_io::chunked_orc_writer_options opts = @@ -863,7 +864,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.orc"); cudf_io::chunked_orc_writer_options opts = @@ -910,7 +911,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.orc"); cudf_io::chunked_orc_writer_options opts = diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 013457d8ed6..880f11aaeb2 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -472,7 +473,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 || i <= 60); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); auto col5_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); auto col6_mask = @@ -1218,7 +1219,7 @@ TEST_F(ParquetChunkedWriterTest, SimpleTable) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedSimple.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1238,7 +1239,7 @@ TEST_F(ParquetChunkedWriterTest, LargeTables) auto table1 = create_random_fixed_table(512, 4096, true); auto table2 = create_random_fixed_table(512, 8192, true); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedLarge.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1300,7 +1301,7 @@ TEST_F(ParquetChunkedWriterTest, Strings) cols.push_back(strings2.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedStrings.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1359,7 +1360,7 @@ TEST_F(ParquetChunkedWriterTest, ListColumn) auto tbl0 = table_view({col0_tbl0, col1_tbl0, col2_tbl0}); auto tbl1 = table_view({col0_tbl1, col1_tbl1, col2_tbl1}); - auto expected = cudf::concatenate({tbl0, tbl1}); + auto expected = cudf::concatenate(std::vector({tbl0, tbl1})); auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1413,7 +1414,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct) auto table_2 = table_view({*list_col_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = 
cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("family"); @@ -1504,7 +1505,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) auto table_2 = table_view({*list_col_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("family"); @@ -1639,7 +1640,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullability) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, false); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedNullable.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1678,7 +1679,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; auto table_2 = cudf::table_view({struct_2_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("being"); @@ -1707,7 +1708,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullability) auto table1 = create_random_fixed_table(5, 5, false); auto table2 = create_random_fixed_table(5, 5, false); - auto full_table = cudf::concatenate({*table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedNoNullable.parquet"); @@ -1764,7 +1765,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList) auto table1 = table_view({col00, col10}); auto table2 = table_view({col01, col11}); - auto full_table = cudf::concatenate({table1, table2}); + auto full_table = cudf::concatenate(std::vector({table1, table2})); cudf_io::table_input_metadata metadata(table1); metadata.column_metadata[0].set_nullability(true); // List is nullable at first (root) level @@ -1809,7 +1810,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}}; auto table_2 = cudf::table_view({struct_2_2}); - auto full_table = cudf::concatenate({table_1, table_2}); + auto full_table = cudf::concatenate(std::vector({table_1, table_2})); cudf_io::table_input_metadata expected_metadata(table_1); expected_metadata.column_metadata[0].set_name("being").set_nullability(false); @@ -1838,7 +1839,7 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroups) auto table1 = create_random_fixed_table(5, 5, true); auto table2 = create_random_fixed_table(5, 5, true); - auto full_table = cudf::concatenate({*table2, *table1, *table2}); + auto full_table = cudf::concatenate(std::vector({*table2, *table1, *table2})); auto filepath = temp_env->get_temp_filepath("ChunkedRowGroups.parquet"); cudf_io::chunked_parquet_writer_options args = @@ -1951,7 +1952,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.parquet"); cudf_io::chunked_parquet_writer_options 
args = @@ -1998,7 +1999,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) cols.push_back(c2b_w.release()); cudf::table tbl2(std::move(cols)); - auto expected = cudf::concatenate({tbl1, tbl2}); + auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.parquet"); cudf_io::chunked_parquet_writer_options args = diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 451fa82d5a3..b7d98704aff 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -705,7 +705,7 @@ TEST_F(MergeTest, KeysWithNulls) auto valids2 = cudf::detail::make_counting_transform_iterator( 0, [](auto row) { return (row % 15 == 0) ? false : true; }); cudf::test::fixed_width_column_wrapper data2(data_iter, data_iter + nrows, valids2); - auto all_data = cudf::concatenate({data1, data2}); + auto all_data = cudf::concatenate(std::vector{{data1, data2}}); std::vector column_orders{cudf::order::ASCENDING, cudf::order::DESCENDING}; std::vector null_precedences{cudf::null_order::AFTER, cudf::null_order::BEFORE}; diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index 1e6bb2a70fb..abc9a9bfe9e 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -162,6 +162,73 @@ auto nulls_before() return strings_column_wrapper({"identical", "identical"}, {0, 1}); } +// ----- struct_view {"nestedInt" : {"Int" : 0 }, "float" : 1} + +template +typename std::enable_if::value, structs_column_wrapper>::type +ascending() +{ + using T1 = int32_t; + auto int_col = fixed_width_column_wrapper({std::numeric_limits::lowest(), + T1(-100), + T1(-10), + T1(-10), + T1(0), + T1(10), + T1(10), + T1(100), + std::numeric_limits::max()}); + auto nestedInt_col = structs_column_wrapper{{int_col}}; + auto float_col = ascending(); + return structs_column_wrapper{{nestedInt_col, float_col}}; +} + +template +typename std::enable_if::value, structs_column_wrapper>::type +descending() +{ + using T1 = int32_t; + auto int_col = fixed_width_column_wrapper({std::numeric_limits::max(), + T1(100), + T1(10), + T1(10), + T1(0), + T1(-10), + T1(-10), + T1(-100), + std::numeric_limits::lowest()}); + auto nestedInt_col = structs_column_wrapper{{int_col}}; + auto float_col = descending(); + return structs_column_wrapper{{nestedInt_col, float_col}}; +} + +template <> +auto empty() +{ + auto int_col = fixed_width_column_wrapper(); + auto col1 = structs_column_wrapper{{int_col}}; + auto col2 = fixed_width_column_wrapper(); + return structs_column_wrapper{{col1, col2}}; +} + +template <> +auto nulls_after() +{ + auto int_col = fixed_width_column_wrapper({1, 1}); + auto col1 = structs_column_wrapper{{int_col}}; + auto col2 = fixed_width_column_wrapper({1, 1}); + return structs_column_wrapper{{col1, col2}, {1, 0}}; +} + +template <> +auto nulls_before() +{ + auto int_col = fixed_width_column_wrapper({1, 1}); + auto col1 = structs_column_wrapper{{int_col}}; + auto col2 = fixed_width_column_wrapper({1, 1}); + return structs_column_wrapper{{col1, col2}, {0, 1}}; +} + } // namespace testdata } // anonymous namespace @@ -172,7 +239,8 @@ template struct IsSortedTest : public BaseFixture { }; -TYPED_TEST_CASE(IsSortedTest, ComparableTypes); +using 
SupportedTypes = Concat>; +TYPED_TEST_CASE(IsSortedTest, SupportedTypes); TYPED_TEST(IsSortedTest, NoColumns) { diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 5359014a831..9eb082c513c 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,6 +197,386 @@ TYPED_TEST(Sort, WithAllValid) } } +TYPED_TEST(Sort, WithStructColumn) +{ + using T = TypeParam; + + std::initializer_list names = {"Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Überwald", + "Cheery Littlebottom", + "Detritus", + "Mr Slant"}; + auto num_rows{std::distance(names.begin(), names.end())}; + auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()}; + auto ages_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_col = + cudf::test::structs_column_wrapper{{names_col, ages_col, is_human_col}}.release(); + auto struct_col_view{struct_col->view()}; + EXPECT_EQ(num_rows, struct_col->size()); + + fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 9}}; + strings_column_wrapper col2({"d", "e", "a", "d", "k", "a"}); + fixed_width_column_wrapper col3{{10, 40, 70, 5, 2, 20}}; + table_view input{{col1, col2, col3, struct_col_view}}; + + fixed_width_column_wrapper expected{{2, 1, 0, 3, 4, 5}}; + std::vector column_order{ + order::ASCENDING, order::ASCENDING, order::DESCENDING, order::ASCENDING}; + + auto got = sorted_order(input, column_order); + + // Skip validating bools order. 
Valid true bools are all + // equivalent, and yield random order after thrust::sort + if (!std::is_same::value) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + // Run test for sort and sort_by_key + run_sort_test(input, expected, column_order); + } else { + // Run test for sort and sort_by_key + fixed_width_column_wrapper expected_for_bool{{2, 5, 3, 0, 1, 4}}; + run_sort_test(input, expected_for_bool, column_order); + } +} + +TYPED_TEST(Sort, WithNestedStructColumn) +{ + using T = TypeParam; + + std::initializer_list names = {"Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Überwald", + "Cheery Littlebottom", + "Detritus", + "Mr Slant"}; + std::vector v{1, 1, 0, 1, 1, 0}; + auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()}; + auto ages_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}}; + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + auto struct_col1 = cudf::test::structs_column_wrapper{{names_col, ages_col, is_human_col}, v}; + + auto ages_col2 = cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}}; + auto struct_col2 = cudf::test::structs_column_wrapper{{ages_col2, struct_col1}}.release(); + + auto struct_col_view{struct_col2->view()}; + + fixed_width_column_wrapper col1{{6, 6, 6, 6, 6, 6}}; + fixed_width_column_wrapper col2{{1, 1, 1, 2, 2, 2}}; + table_view input{{col1, col2, struct_col_view}}; + + fixed_width_column_wrapper expected{{3, 5, 4, 2, 1, 0}}; + std::vector column_order{order::ASCENDING, order::DESCENDING, order::ASCENDING}; + + auto got = sorted_order(input, column_order); + + // Skip validating bools order. Valid true bools are all + // equivalent, and yield random order after thrust::sort + if (!std::is_same::value) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + // Run test for sort and sort_by_key + run_sort_test(input, expected, column_order); + } else { + // Run test for sort and sort_by_key + fixed_width_column_wrapper expected_for_bool{{2, 5, 1, 3, 4, 0}}; + run_sort_test(input, expected_for_bool, column_order); + } +} + +TYPED_TEST(Sort, WithSingleStructColumn) +{ + using T = TypeParam; + + std::initializer_list names = {"Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Überwald", + "Cheery Littlebottom", + "Detritus", + "Mr Slant"}; + std::vector v{1, 1, 0, 1, 1, 0}; + auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()}; + auto ages_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}}; + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + auto struct_col = + cudf::test::structs_column_wrapper{{names_col, ages_col, is_human_col}, v}.release(); + auto struct_col_view{struct_col->view()}; + table_view input{{struct_col_view}}; + + fixed_width_column_wrapper expected{{2, 5, 1, 3, 4, 0}}; + std::vector column_order{order::ASCENDING}; + + auto got = sorted_order(input, column_order); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + // Run test for sort and sort_by_key + run_sort_test(input, expected, column_order); +} + +TYPED_TEST(Sort, WithSlicedStructColumn) +{ + using T = TypeParam; + /* + /+-------------+ + | s| + +--------------+ + 0 | {"bbe", 1, 7}| + 1 | {"bbe", 1, 8}| + 2 | {"aaa", 0, 1}| + 3 | {"abc", 0, 1}| + 4 | {"ab", 0, 9}| + 5 | {"za", 2, 5}| + 6 | {"b", 1, 7}| + 7 | { @, 3, 3}| + +--------------+ + */ + // clang-format off + using 
FWCW = cudf::test::fixed_width_column_wrapper; + std::vector string_valids{ 1, 1, 1, 1, 1, 1, 1, 0}; + std::initializer_list names = {"bbe", "bbe", "aaa", "abc", "ab", "za", "b", "x"}; + auto col2 = FWCW{{ 1, 1, 0, 0, 0, 2, 1, 3}}; + auto col3 = FWCW{{ 7, 8, 1, 1, 9, 5, 7, 3}}; + auto col1 = cudf::test::strings_column_wrapper{names.begin(), names.end(), string_valids.begin()}; + auto struct_col = structs_column_wrapper{{col1, col2, col3}}.release(); + // clang-format on + auto struct_col_view{struct_col->view()}; + table_view input{{struct_col_view}}; + auto sliced_columns = cudf::split(struct_col_view, std::vector{3}); + auto sliced_tables = cudf::split(input, std::vector{3}); + std::vector column_order{order::ASCENDING}; + /* + asce_null_first sliced[3:] + /+-------------+ + | s| + +--------------+ + 7 | { @, 3, 3}| 7=4 + 2 | {"aaa", 0, 1}| + 4 | {"ab", 0, 9}| 4=1 + 3 | {"abc", 0, 1}| 3=0 + 6 | {"b", 1, 7}| 6=3 + 0 | {"bbe", 1, 7}| + 1 | {"bbe", 1, 8}| + 5 | {"za", 2, 5}| 5=2 + +--------------+ + */ + + // normal + fixed_width_column_wrapper expected{{7, 2, 4, 3, 6, 0, 1, 5}}; + auto got = sorted_order(input, column_order); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected, column_order); + + // table with sliced column + table_view input2{{sliced_columns[1]}}; + fixed_width_column_wrapper expected2{{4, 1, 0, 3, 2}}; + got = sorted_order(input2, column_order); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input2, expected2, column_order); + + // sliced table[1] + fixed_width_column_wrapper expected3{{4, 1, 0, 3, 2}}; + got = sorted_order(sliced_tables[1], column_order); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, got->view()); + // Run test for sort and sort_by_key + run_sort_test(sliced_tables[1], expected3, column_order); + + // sliced table[0] + fixed_width_column_wrapper expected4{{2, 0, 1}}; + got = sorted_order(sliced_tables[0], column_order); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, got->view()); + // Run test for sort and sort_by_key + run_sort_test(sliced_tables[0], expected4, column_order); +} + +TYPED_TEST(Sort, SlicedColumns) +{ + using T = TypeParam; + using FWCW = cudf::test::fixed_width_column_wrapper; + + // clang-format off + std::vector string_valids{ 1, 1, 1, 1, 1, 1, 1, 0}; + std::initializer_list names = {"bbe", "bbe", "aaa", "abc", "ab", "za", "b", "x"}; + auto col2 = FWCW{{ 7, 8, 1, 1, 9, 5, 7, 3}}; + auto col1 = cudf::test::strings_column_wrapper{names.begin(), names.end(), string_valids.begin()}; + // clang-format on + table_view input{{col1, col2}}; + auto sliced_columns1 = cudf::split(col1, std::vector{3}); + auto sliced_columns2 = cudf::split(col1, std::vector{3}); + auto sliced_tables = cudf::split(input, std::vector{3}); + std::vector column_order{order::ASCENDING, order::ASCENDING}; + + // normal + // fixed_width_column_wrapper expected{{2, 3, 7, 5, 0, 6, 1, 4}}; + fixed_width_column_wrapper expected{{7, 2, 4, 3, 6, 0, 1, 5}}; + auto got = sorted_order(input, column_order); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected, column_order); + + // table with sliced column + table_view input2{{sliced_columns1[1], sliced_columns2[1]}}; + // fixed_width_column_wrapper expected2{{0, 4, 2, 3, 1}}; + fixed_width_column_wrapper expected2{{4, 1, 0, 3, 2}}; + got = sorted_order(input2, column_order); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, 
got->view()); + // Run test for sort and sort_by_key + run_sort_test(input2, expected2, column_order); +} + +TYPED_TEST(Sort, WithStructColumnCombinations) +{ + using T = TypeParam; + using FWCW = cudf::test::fixed_width_column_wrapper; + + // clang-format off + /* + +------------+ + | s| + +------------+ + 0 | {0, null}| + 1 | {1, null}| + 2 | null| + 3 |{null, null}| + 4 | null| + 5 |{null, null}| + 6 | {null, 1}| + 7 | {null, 0}| + +------------+ + */ + std::vector struct_valids{1, 1, 0, 1, 0, 1, 1, 1}; + auto col1 = FWCW{{ 0, 1, 9, -1, 9, -1, -1, -1}, {1, 1, 1, 0, 1, 0, 0, 0}}; + auto col2 = FWCW{{-1, -1, 9, -1, 9, -1, 1, 0}, {0, 0, 1, 0, 1, 0, 1, 1}}; + auto struct_col = cudf::test::structs_column_wrapper{{col1, col2}, struct_valids}.release(); + /* + desc_nulls_first desc_nulls_last asce_nulls_first asce_nulls_last + +------------+ +------------+ +------------+ +------------+ + | s| | s| | s| | s| + +------------+ +------------+ +------------+ +------------+ + 2 | null| 1 | {1, null}| 2 | null| 3 |{null, null}| + 4 | null| 0 | {0, null}| 4 | null| 5 |{null, null}| + 1 | {1, null}| 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| + 0 | {0, null}| 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| + 6 | {null, 1}| 3 |{null, null}| 7 | {null, 0}| 0 | {0, null}| + 7 | {null, 0}| 5 |{null, null}| 6 | {null, 1}| 1 | {1, null}| + 3 |{null, null}| 2 | null| 0 | {0, null}| 2 | null| + 5 |{null, null}| 4 | null| 1 | {1, null}| 4 | null| + +------------+ +------------+ +------------+ +------------+ + */ + // clang-format on + auto struct_col_view{struct_col->view()}; + table_view input{{struct_col_view}}; + std::vector column_order1{order::DESCENDING}; + + // desc_nulls_first + fixed_width_column_wrapper expected1{{2, 4, 1, 0, 6, 7, 3, 5}}; + auto got = sorted_order(input, column_order1, {null_order::AFTER}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected1, column_order1, {null_order::AFTER}); + + // desc_nulls_last + fixed_width_column_wrapper expected2{{1, 0, 6, 7, 3, 5, 2, 4}}; + got = sorted_order(input, column_order1, {null_order::BEFORE}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected2, column_order1, {null_order::BEFORE}); + + // asce_nulls_first + std::vector column_order2{order::ASCENDING}; + fixed_width_column_wrapper expected3{{2, 4, 3, 5, 7, 6, 0, 1}}; + got = sorted_order(input, column_order2, {null_order::BEFORE}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected3, column_order2, {null_order::BEFORE}); + + // asce_nulls_last + fixed_width_column_wrapper expected4{{3, 5, 7, 6, 0, 1, 2, 4}}; + got = sorted_order(input, column_order2, {null_order::AFTER}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected4, column_order2, {null_order::AFTER}); +} + +TYPED_TEST(Sort, WithStructColumnCombinationsWithoutNulls) +{ + using T = TypeParam; + using FWCW = cudf::test::fixed_width_column_wrapper; + + // clang-format off + /* + +------------+ + | s| + +------------+ + 0 | {0, null}| + 1 | {1, null}| + 2 | {9, 9}| + 3 |{null, null}| + 4 | {9, 9}| + 5 |{null, null}| + 6 | {null, 1}| + 7 | {null, 0}| + +------------+ + */ + auto col1 = FWCW{{ 0, 1, 9, -1, 9, -1, -1, -1}, {1, 1, 1, 0, 1, 0, 0, 0}}; + auto col2 = FWCW{{-1, -1, 9, -1, 9, -1, 1, 0}, {0, 0, 1, 0, 1, 0, 1, 1}}; + auto 
struct_col = cudf::test::structs_column_wrapper{{col1, col2}}.release(); + /* (nested columns are always nulls_first, spark requirement) + desc_nulls_* asce_nulls_* + +------------+ +------------+ + | s| | s| + +------------+ +------------+ + 2 | {9, 9}| 3 |{null, null}| + 4 | {9, 9}| 5 |{null, null}| + 1 | {1, null}| 7 | {null, 0}| + 0 | {0, null}| 6 | {null, 1}| + 6 | {null, 1}| 0 | {0, null}| + 7 | {null, 0}| 1 | {1, null}| + 3 |{null, null}| 2 | {9, 9}| + 5 |{null, null}| 4 | {9, 9}| + +------------+ +------------+ + */ + // clang-format on + auto struct_col_view{struct_col->view()}; + table_view input{{struct_col_view}}; + std::vector column_order{order::DESCENDING}; + + // desc_nulls_first + fixed_width_column_wrapper expected1{{2, 4, 1, 0, 6, 7, 3, 5}}; + auto got = sorted_order(input, column_order, {null_order::AFTER}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected1, column_order, {null_order::AFTER}); + + // desc_nulls_last + got = sorted_order(input, column_order, {null_order::BEFORE}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected1, column_order, {null_order::BEFORE}); + + // asce_nulls_first + std::vector column_order2{order::ASCENDING}; + fixed_width_column_wrapper expected2{{3, 5, 7, 6, 0, 1, 2, 4}}; + got = sorted_order(input, column_order2, {null_order::BEFORE}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected2, column_order2, {null_order::BEFORE}); + + // asce_nulls_last + got = sorted_order(input, column_order2, {null_order::AFTER}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got->view()); + // Run test for sort and sort_by_key + run_sort_test(input, expected2, column_order2, {null_order::AFTER}); +} + TYPED_TEST(Sort, Stable) { using T = TypeParam; diff --git a/cpp/tests/structs/structs_column_tests.cu b/cpp/tests/structs/structs_column_tests.cu index 2a0856133ba..e1438c33044 100644 --- a/cpp/tests/structs/structs_column_tests.cu +++ b/cpp/tests/structs/structs_column_tests.cu @@ -68,7 +68,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestColumnFactoryConstruction) { auto names_col = cudf::test::strings_column_wrapper{ - "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Uberwald"} + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} .release(); int num_rows{names_col->size()}; @@ -95,7 +95,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestColumnFactoryConstruction) // Check child columns for exactly correct values. 
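Taken together, the sort_test additions above exercise cudf::sorted_order over struct columns, including sliced struct columns and the nested-null orderings spelled out in the comment tables. Reduced to its core, the pattern those tests repeat looks roughly like this (includes and template arguments are assumptions here, since the hunks elide them; the helper name is illustrative):

    #include <cudf/column/column.hpp>
    #include <cudf/sorting.hpp>
    #include <cudf/table/table_view.hpp>
    #include <cudf_test/column_wrapper.hpp>

    #include <memory>
    #include <vector>

    // Returns an INT32 column of row indices that would sort the struct keys.
    std::unique_ptr<cudf::column> sort_struct_keys()
    {
      // A struct<string, int32> column with three rows.
      cudf::test::strings_column_wrapper names{"Vimes", "Carrot", "Angua"};
      cudf::test::fixed_width_column_wrapper<int32_t> ages{{48, 27, 25}};
      cudf::test::structs_column_wrapper people{{names, ages}};

      cudf::table_view keys{{people}};

      // One order / null_order entry per sort key; ascending with nulls first.
      std::vector<cudf::order> column_order{cudf::order::ASCENDING};
      std::vector<cudf::null_order> null_precedence{cudf::null_order::BEFORE};

      return cudf::sorted_order(keys, column_order, null_precedence);
    }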
vector_of_columns expected_children; expected_children.emplace_back(cudf::test::strings_column_wrapper{ - "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Uberwald"} + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} .release()); expected_children.emplace_back( cudf::test::fixed_width_column_wrapper{48, 27, 25}.release()); @@ -116,7 +116,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestColumnWrapperConstruction) { std::initializer_list names = {"Samuel Vimes", "Carrot Ironfoundersson", - "Angua von Uberwald", + "Angua von Überwald", "Cheery Littlebottom", "Detritus", "Mr Slant"}; @@ -174,7 +174,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestStructsContainingLists) std::initializer_list names = {"Samuel Vimes", "Carrot Ironfoundersson", - "Angua von Uberwald", + "Angua von Überwald", "Cheery Littlebottom", "Detritus", "Mr Slant"}; @@ -234,7 +234,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, StructOfStructs) auto names = {"Samuel Vimes", "Carrot Ironfoundersson", - "Angua von Uberwald", + "Angua von Überwald", "Cheery Littlebottom", "Detritus", "Mr Slant"}; @@ -300,7 +300,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestNullMaskPropagationForNonNullStruct auto names = {"Samuel Vimes", "Carrot Ironfoundersson", - "Angua von Uberwald", + "Angua von Überwald", "Cheery Littlebottom", "Detritus", "Mr Slant"}; @@ -393,7 +393,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestListsOfStructs) std::initializer_list names = {"Samuel Vimes", "Carrot Ironfoundersson", - "Angua von Uberwald", + "Angua von Überwald", "Cheery Littlebottom", "Detritus", "Mr Slant"}; diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index e8953ab9a30..15d014f9d9c 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -537,6 +537,9 @@ inline auto make_fixed_point_data_type(int32_t scale) return cudf::data_type{cudf::type_to_id(), scale}; } +struct FixedPointTestSingleType : public cudf::test::BaseFixture { +}; + template struct FixedPointTests : public cudf::test::BaseFixture { }; @@ -592,6 +595,18 @@ TYPED_TEST(FixedPointTests, CastToInt32) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TEST_F(FixedPointTestSingleType, CastDecimal64ToInt32) +{ + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + using fw_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const input = fp_wrapper{{7246212000}, numeric::scale_type{-5}}; + auto const expected = fw_wrapper{72462}; + auto const result = cudf::cast(input, make_data_type()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + TYPED_TEST(FixedPointTests, CastToIntLarge) { using namespace numeric; @@ -659,6 +674,18 @@ TYPED_TEST(FixedPointTests, CastFromInt) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TEST_F(FixedPointTestSingleType, CastInt32ToDecimal64) +{ + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + using fw_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const input = fw_wrapper{-48938}; + auto const expected = fp_wrapper{{-4893800000LL}, numeric::scale_type{-5}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(-5)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + TYPED_TEST(FixedPointTests, CastFromIntLarge) { using namespace numeric; diff --git a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md index 7e96d4fe38c..5376df261e7 100644 --- a/docs/cudf/source/groupby.md +++ b/docs/cudf/source/groupby.md @@ -137,6 +137,7 @@ The following table summarizes the available 
aggregations and the types that sup | nunique | ✅ | ✅ | ✅ | ✅ | | | | nth | ✅ | ✅ | ✅ | | | | | collect | ✅ | ✅ | ✅ | | ✅ | | +| unique | ✅ | ✅ | ✅ | ✅ | | | ## GroupBy apply diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index e50a9e86ead..90fe3553abc 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -288,19 +288,34 @@ public final ColumnVector isNull() { /** * Returns a Boolean vector with the same number of rows as this instance, that has * TRUE for any entry that is an integer, and FALSE if its not an integer. A null will be returned - * for null entries + * for null entries. * * NOTE: Integer doesn't mean a 32-bit integer. It means a number that is not a fraction. * i.e. If this method returns true for a value it could still result in an overflow or underflow * if you convert it to a Java integral type * - * @return - Boolean vector + * @return Boolean vector */ public final ColumnVector isInteger() { assert type.equals(DType.STRING); return new ColumnVector(isInteger(getNativeView())); } + /** + * Returns a Boolean vector with the same number of rows as this instance, that has + * TRUE for any entry that is an integer, and FALSE if its not an integer. A null will be returned + * for null entries. + * + * @param intType the data type that should be used for bounds checking. Note that only + * integer types are allowed. + * @return Boolean vector + */ + public final ColumnVector isInteger(DType intType) { + assert type.equals(DType.STRING); + return new ColumnVector(isIntegerWithType(getNativeView(), + intType.getTypeId().getNativeId(), intType.getScale())); + } + /** * Returns a Boolean vector with the same number of rows as this instance, that has * TRUE for any entry that is a float, and FALSE if its not a float. A null will be returned @@ -373,7 +388,19 @@ public final ColumnVector findAndReplaceAll(ColumnView oldValues, ColumnView new * @return - ColumnVector with nulls replaced by scalar */ public final ColumnVector replaceNulls(Scalar scalar) { - return new ColumnVector(replaceNulls(getNativeView(), scalar.getScalarHandle())); + return new ColumnVector(replaceNullsScalar(getNativeView(), scalar.getScalarHandle())); + } + + /** + * Returns a ColumnVector with any null values replaced with the corresponding row in the + * specified replacement column. + * This column and the replacement column must have the same type and number of rows. 
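The column-based replaceNulls overload documented here forwards, via the replaceNullsColumn JNI entry point shown further below, to the libcudf replace_nulls overload that takes a replacement column of matching type and row count. A small sketch of that underlying call, with illustrative values and a hypothetical helper name:

    #include <cudf/column/column.hpp>
    #include <cudf/replace.hpp>
    #include <cudf_test/column_wrapper.hpp>

    #include <memory>

    // input:        {1, null, 3, null}
    // replacements: {9,    8, 7,    6}  ->  result {1, 8, 3, 6}
    std::unique_ptr<cudf::column> fill_nulls_from_column()
    {
      cudf::test::fixed_width_column_wrapper<int32_t> input{{1, 0, 3, 0}, {1, 0, 1, 0}};
      cudf::test::fixed_width_column_wrapper<int32_t> replacements{9, 8, 7, 6};

      // Both arguments convert to cudf::column_view; they must have the same
      // data type and number of rows, otherwise libcudf throws cudf::logic_error.
      return cudf::replace_nulls(input, replacements);
    }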
+ * + * @param replacements column of replacement values + * @return column with nulls replaced by corresponding row of replacements column + */ + public final ColumnVector replaceNulls(ColumnView replacements) { + return new ColumnVector(replaceNullsColumn(getNativeView(), replacements.getNativeView())); } /** @@ -2825,7 +2852,9 @@ private static native long rollingWindow( private static native long charLengths(long viewHandle) throws CudfException; - private static native long replaceNulls(long viewHandle, long scalarHandle) throws CudfException; + private static native long replaceNullsScalar(long viewHandle, long scalarHandle) throws CudfException; + + private static native long replaceNullsColumn(long viewHandle, long replaceViewHandle) throws CudfException; private static native long ifElseVV(long predVec, long trueVec, long falseVec) throws CudfException; @@ -2845,6 +2874,8 @@ private static native long rollingWindow( private static native long isInteger(long viewHandle); + private static native long isIntegerWithType(long viewHandle, int typeId, int typeScale); + private static native long isNotNanNative(long viewHandle); private static native long isNotNullNative(long viewHandle); diff --git a/java/src/main/java/ai/rapids/cudf/OrderByArg.java b/java/src/main/java/ai/rapids/cudf/OrderByArg.java new file mode 100644 index 00000000000..fbdd7035c76 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/OrderByArg.java @@ -0,0 +1,59 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.io.Serializable; + +/** + * Provides the ordering for specific columns. + */ +public final class OrderByArg implements Serializable { + final int index; + final boolean isDescending; + final boolean isNullSmallest; + + OrderByArg(int index, boolean isDescending, boolean isNullSmallest) { + this.index = index; + this.isDescending = isDescending; + this.isNullSmallest = isNullSmallest; + } + + public static OrderByArg asc(final int index) { + return new OrderByArg(index, false, false); + } + + public static OrderByArg desc(final int index) { + return new OrderByArg(index, true, false); + } + + public static OrderByArg asc(final int index, final boolean isNullSmallest) { + return new OrderByArg(index, false, isNullSmallest); + } + + public static OrderByArg desc(final int index, final boolean isNullSmallest) { + return new OrderByArg(index, true, isNullSmallest); + } + + @Override + public String toString() { + return "ORDER BY " + index + + (isDescending ? " DESC " : " ASC ") + + (isNullSmallest ? 
"NULL SMALLEST" : "NULL LARGEST"); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4da99d811f2..6e0b7d3bb94 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -25,7 +25,6 @@ import ai.rapids.cudf.HostColumnVector.StructType; import java.io.File; -import java.io.Serializable; import java.math.BigDecimal; import java.math.RoundingMode; import java.nio.ByteBuffer; @@ -1444,7 +1443,7 @@ public ColumnVector sortOrder(OrderByArg... args) { * responsible for cleaning up * the {@link ColumnVector} returned as part of the output {@link Table} *
<p>
- * Example usage: orderBy(true, Table.asc(0), Table.desc(3)...); + * Example usage: orderBy(true, OrderByArg.asc(0), OrderByArg.desc(3)...); * @param args Suppliers to initialize sortKeys. * @return Sorted Table */ @@ -1512,22 +1511,6 @@ public static Table merge(List
<Table>
tables, OrderByArg... args) { return merge(tables.toArray(new Table[tables.size()]), args); } - public static OrderByArg asc(final int index) { - return new OrderByArg(index, false, false); - } - - public static OrderByArg desc(final int index) { - return new OrderByArg(index, true, false); - } - - public static OrderByArg asc(final int index, final boolean isNullSmallest) { - return new OrderByArg(index, false, isNullSmallest); - } - - public static OrderByArg desc(final int index, final boolean isNullSmallest) { - return new OrderByArg(index, true, isNullSmallest); - } - /** * Returns count aggregation with only valid values. * Null values are skipped. @@ -2093,25 +2076,6 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data // HELPER CLASSES ///////////////////////////////////////////////////////////////////////////// - public static final class OrderByArg implements Serializable { - final int index; - final boolean isDescending; - final boolean isNullSmallest; - - OrderByArg(int index, boolean isDescending, boolean isNullSmallest) { - this.index = index; - this.isDescending = isDescending; - this.isNullSmallest = isNullSmallest; - } - - @Override - public String toString() { - return "ORDER BY " + index + - (isDescending ? " DESC " : " ASC ") + - (isNullSmallest ? "NULL SMALLEST" : "NULL LARGEST"); - } - } - /** * class to encapsulate indices and table */ diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4132016d85c..dc1acc50b5f 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -121,8 +121,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lowerStrings(JNIEnv *env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNulls(JNIEnv *env, jclass, - jlong j_col, jlong j_scalar) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsScalar(JNIEnv *env, jclass, + jlong j_col, + jlong j_scalar) { JNI_NULL_CHECK(env, j_col, "column is null", 0); JNI_NULL_CHECK(env, j_scalar, "scalar is null", 0); try { @@ -135,6 +136,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNulls(JNIEnv *env, CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsColumn(JNIEnv *env, jclass, + jlong j_col, + jlong j_replace_col) { + JNI_NULL_CHECK(env, j_col, "column is null", 0); + JNI_NULL_CHECK(env, j_replace_col, "replacement column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto col = reinterpret_cast(j_col); + auto replacements = reinterpret_cast(j_replace_col); + std::unique_ptr result = cudf::replace_nulls(*col, *replacements); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVV(JNIEnv *env, jclass, jlong j_pred_vec, jlong j_true_vec, @@ -1788,6 +1804,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv *env, jo CATCH_STD(env, 0) } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(JNIEnv *env, jobject, + jlong handle, + jint j_dtype, + jint scale) { + + JNI_NULL_CHECK(env, handle, "native view handle is null", 0) + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *view = reinterpret_cast(handle); + cudf::data_type int_dtype = cudf::jni::make_data_type(j_dtype, scale); + std::unique_ptr result = cudf::strings::is_integer(*view, int_dtype); + return reinterpret_cast(result.release()); + } + 
CATCH_STD(env, 0) +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv *env, jobject j_object, jlong handle) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 02fbe56431b..fe1cba5ceb1 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1368,7 +1368,7 @@ void testFromScalarNullByte() { } @Test - void testReplaceEmptyColumn() { + void testReplaceNullsScalarEmptyColumn() { try (ColumnVector input = ColumnVector.fromBoxedBooleans(); ColumnVector expected = ColumnVector.fromBoxedBooleans(); Scalar s = Scalar.fromBool(false); @@ -1378,7 +1378,7 @@ void testReplaceEmptyColumn() { } @Test - void testReplaceNullBoolsWithAllNulls() { + void testReplaceNullsScalarBoolsWithAllNulls() { try (ColumnVector input = ColumnVector.fromBoxedBooleans(null, null, null, null); ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, false, false); Scalar s = Scalar.fromBool(false); @@ -1388,7 +1388,7 @@ void testReplaceNullBoolsWithAllNulls() { } @Test - void testReplaceSomeNullBools() { + void testReplaceNullsScalarSomeNullBools() { try (ColumnVector input = ColumnVector.fromBoxedBooleans(false, null, null, false); ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, false); Scalar s = Scalar.fromBool(true); @@ -1398,7 +1398,7 @@ void testReplaceSomeNullBools() { } @Test - void testReplaceNullIntegersWithAllNulls() { + void testReplaceNullsScalarIntegersWithAllNulls() { try (ColumnVector input = ColumnVector.fromBoxedInts(null, null, null, null); ColumnVector expected = ColumnVector.fromBoxedInts(0, 0, 0, 0); Scalar s = Scalar.fromInt(0); @@ -1408,7 +1408,7 @@ void testReplaceNullIntegersWithAllNulls() { } @Test - void testReplaceSomeNullIntegers() { + void testReplaceNullsScalarSomeNullIntegers() { try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 999, 4, 999); Scalar s = Scalar.fromInt(999); @@ -1418,7 +1418,7 @@ void testReplaceSomeNullIntegers() { } @Test - void testReplaceNullsFailsOnTypeMismatch() { + void testReplaceNullsScalarFailsOnTypeMismatch() { try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null); Scalar s = Scalar.fromBool(true)) { assertThrows(CudfException.class, () -> input.replaceNulls(s).close()); @@ -1434,6 +1434,44 @@ void testReplaceNullsWithNullScalar() { } } + @Test + void testReplaceNullsColumnEmptyColumn() { + try (ColumnVector input = ColumnVector.fromBoxedBooleans(); + ColumnVector r = ColumnVector.fromBoxedBooleans(); + ColumnVector expected = ColumnVector.fromBoxedBooleans(); + ColumnVector result = input.replaceNulls(r)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testReplaceNullsColumnBools() { + try (ColumnVector input = ColumnVector.fromBoxedBooleans(null, true, null, false); + ColumnVector r = ColumnVector.fromBoxedBooleans(false, null, true, true); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, false); + ColumnVector result = input.replaceNulls(r)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testReplaceNullsColumnIntegers() { + try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null); + ColumnVector r = ColumnVector.fromBoxedInts(996, 997, 998, 909, null); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 998, 4, null); + ColumnVector 
result = input.replaceNulls(r)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testReplaceNullsColumnFailsOnTypeMismatch() { + try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null); + ColumnVector r = ColumnVector.fromBoxedBooleans(true)) { + assertThrows(CudfException.class, () -> input.replaceNulls(r).close()); + } + } + static QuantileMethod[] methods = {LINEAR, LOWER, HIGHER, MIDPOINT, NEAREST}; static double[] quantiles = {0.0, 0.25, 0.33, 0.5, 1.0}; @@ -3339,6 +3377,69 @@ void testNansToNulls() { } } + @Test + void testIsIntegerWithBounds() { + String[] intStrings = {"A", "nan", "Inf", "-Inf", "3.5", + String.valueOf(Byte.MIN_VALUE), + String.valueOf(Byte.MIN_VALUE + 1L), + String.valueOf(Byte.MIN_VALUE - 1L), + String.valueOf(Byte.MAX_VALUE), + String.valueOf(Byte.MAX_VALUE + 1L), + String.valueOf(Byte.MAX_VALUE - 1L), + String.valueOf(Short.MIN_VALUE), + String.valueOf(Short.MIN_VALUE + 1L), + String.valueOf(Short.MIN_VALUE - 1L), + String.valueOf(Short.MAX_VALUE), + String.valueOf(Short.MAX_VALUE + 1L), + String.valueOf(Short.MAX_VALUE - 1L), + String.valueOf(Integer.MIN_VALUE), + String.valueOf(Integer.MIN_VALUE + 1L), + String.valueOf(Integer.MIN_VALUE - 1L), + String.valueOf(Integer.MAX_VALUE), + String.valueOf(Integer.MAX_VALUE + 1L), + String.valueOf(Integer.MAX_VALUE - 1L), + String.valueOf(Long.MIN_VALUE), + String.valueOf(Long.MIN_VALUE + 1L), + "-9223372036854775809", + String.valueOf(Long.MAX_VALUE), + "9223372036854775808", + String.valueOf(Long.MAX_VALUE - 1L)}; + try (ColumnVector intStringCV = ColumnVector.fromStrings(intStrings); + ColumnVector isByte = intStringCV.isInteger(DType.INT8); + ColumnVector expectedByte = ColumnVector.fromBoxedBooleans( + false, false, false, false, false, + true, true, false, true, false, true, + false, false, false, false, false, false, + false, false, false, false, false, false, + false, false, false, false, false, false); + ColumnVector isShort = intStringCV.isInteger(DType.INT16); + ColumnVector expectedShort = ColumnVector.fromBoxedBooleans( + false, false, false, false, false, + true, true, true, true, true, true, + true, true, false, true, false, true, + false, false, false, false, false, false, + false, false, false, false, false, false); + ColumnVector isInt = intStringCV.isInteger(DType.INT32); + ColumnVector expectedInt = ColumnVector.fromBoxedBooleans( + false, false, false, false, false, + true, true, true, true, true, true, + true, true, true, true, true, true, + true, true, false, true, false, true, + false, false, false, false, false, false); + ColumnVector isLong = intStringCV.isInteger(DType.INT64); + ColumnVector expectedLong = ColumnVector.fromBoxedBooleans( + false, false, false, false, false, + true, true, true, true, true, true, + true, true, true, true, true, true, + true, true, true, true, true, true, + true, true, false, true, false, true)) { + assertColumnsAreEqual(expectedByte, isByte); + assertColumnsAreEqual(expectedShort, isShort); + assertColumnsAreEqual(expectedInt, isInt); + assertColumnsAreEqual(expectedLong, isLong); + } + } + @Test void testIsInteger() { String[] intStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", "2147483647", diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4eee3e97e6e..b6350a207c1 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -369,9 +369,9 @@ void testMergeSimple() { .column(3, 2, 1, 
2, null, 3, 5, 2) .column(1, 9, 7, 3, 5, 3, 1, 10) .build(); - Table sortedTable1 = table1.orderBy(Table.asc(0), Table.desc(1)); - Table sortedTable2 = table2.orderBy(Table.asc(0), Table.desc(1)); - Table merged = Table.merge(Arrays.asList(sortedTable1, sortedTable2), Table.asc(0), Table.desc(1))) { + Table sortedTable1 = table1.orderBy(OrderByArg.asc(0), OrderByArg.desc(1)); + Table sortedTable2 = table2.orderBy(OrderByArg.asc(0), OrderByArg.desc(1)); + Table merged = Table.merge(Arrays.asList(sortedTable1, sortedTable2), OrderByArg.asc(0), OrderByArg.desc(1))) { assertTablesAreEqual(expected, merged); } } @@ -388,7 +388,7 @@ void testOrderByAD() { .column(2, 1, 4, 3, 5) .column(9, 7, 5, 3, 1) .build(); - Table sortedTable = table.orderBy(Table.asc(0), Table.desc(1))) { + Table sortedTable = table.orderBy(OrderByArg.asc(0), OrderByArg.desc(1))) { assertTablesAreEqual(expected, sortedTable); } } @@ -405,7 +405,7 @@ void testSortOrderSimple() { .column(2, 1, 4, 3, 5) .column(9, 7, 5, 3, 1) .build(); - ColumnVector gatherMap = table.sortOrder(Table.asc(0), Table.desc(1)); + ColumnVector gatherMap = table.sortOrder(OrderByArg.asc(0), OrderByArg.desc(1)); Table sortedTable = table.gather(gatherMap)) { assertTablesAreEqual(expected, sortedTable); } @@ -423,7 +423,7 @@ void testOrderByDD() { .column(5, 4, 3, 2, 1) .column(1, 5, 3, 9, 7) .build(); - Table sortedTable = table.orderBy(Table.desc(0), Table.desc(1))) { + Table sortedTable = table.orderBy(OrderByArg.desc(0), OrderByArg.desc(1))) { assertTablesAreEqual(expected, sortedTable); } } @@ -442,7 +442,7 @@ void testOrderByWithNulls() { .column("1", "0", "2", "4", "3") .column(7, 9, 5, 1, 3) .build(); - Table sortedTable = table.orderBy(Table.asc(0), Table.desc(1))) { + Table sortedTable = table.orderBy(OrderByArg.asc(0), OrderByArg.desc(1))) { assertTablesAreEqual(expected, sortedTable); } } @@ -461,7 +461,7 @@ void testOrderByWithNullsAndStrings() { .column(null, null, 4, 3, 5) .column(9, 7, 5, 3, 1) .build(); - Table sortedTable = table.orderBy(Table.asc(0))) { + Table sortedTable = table.orderBy(OrderByArg.asc(0))) { assertTablesAreEqual(expected, sortedTable); } } @@ -867,7 +867,7 @@ void testLeftJoinWithNulls() { .column(null, null, 203, null, null, null, null, 201, 202, 204) // right .build(); Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -891,7 +891,7 @@ void testLeftJoinOnNullKeys() { .build(); Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expectedResults, orderedJoinedTable); } @@ -902,7 +902,7 @@ void testLeftJoinOnNullKeys() { .build(); Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expectedResults, orderedJoinedTable); } } @@ -919,7 +919,7 @@ void testLeftJoin() { .column( 20, 21, 22, 23, 24, 25, 26, 27, 28, 29) .build(); Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true)); + Table 
orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true)); Table expected = new Table.TestBuilder() .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common .column( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) // left @@ -945,7 +945,7 @@ void testFullJoinWithNonCommonKeys() { .column(null, null, null, null, null, 201, 200, null, 203, 202, 204, 205) // right .build(); Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(0, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -968,7 +968,7 @@ void testFullJoinOnNullKeys() { .column( 200, 202, 200, 202, null, null, null, null, null, 201, null, 203, 204, 205) // right .build(); Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(0, true), Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true), OrderByArg.asc(1, true))) { assertTablesAreEqual(expectedResults, orderedJoinedTable); } @@ -980,7 +980,7 @@ void testFullJoinOnNullKeys() { .build(); Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), false); Table orderedJoinedTable = joinedTable.orderBy( - Table.asc(0, true), Table.asc(1, true), Table.asc(2, true))) { + OrderByArg.asc(0, true), OrderByArg.asc(1, true), OrderByArg.asc(2, true))) { assertTablesAreEqual(expectedResults, orderedJoinedTable); } } @@ -997,7 +997,7 @@ void testFullJoinWithOnlyCommonKeys() { .column(200, 201, 202, 203, 204, 205, 206, 207, 208, 209) .build(); Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(new int[]{0}), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true)); + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true)); Table expected = new Table.TestBuilder() .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) // left @@ -1023,7 +1023,7 @@ void testInnerJoinWithNonCommonKeys() { .column(202, 200, 201, 203) // right .build(); Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -1046,7 +1046,7 @@ void testInnerJoinOnNullKeys() { .column(202, 200, 201, 203) // right .build(); Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } @@ -1057,7 +1057,7 @@ void testInnerJoinOnNullKeys() { .column(202, 200, 203) // right .build(); Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))){ + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))){ assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -1074,7 +1074,7 @@ void testInnerJoinWithOnlyCommonKeys() { .column(200, 201, 202, 203, 204, 205, 206, 207, 208, 209) .build(); Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(new int[]{0}), true); - Table orderedJoinedTable = 
joinedTable.orderBy(Table.asc(1, true)); + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true)); Table expected = new Table.TestBuilder() .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) // left @@ -1099,7 +1099,7 @@ void testLeftSemiJoin() { .column(102, 107, 108, 109) .build(); Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -1116,7 +1116,7 @@ void testLeftSemiJoinWithNulls() { .column("20", "21", "22", "23", "24", "25", "26", "27", "28", "29") .build(); Table joinedTable = leftTable.onColumns(0, 2).leftSemiJoin(rightTable.onColumns(0, 1), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(0, true)); + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true)); Table expected = new Table.TestBuilder() .column(254, 326, 361) .column(null, 11, 17) @@ -1143,7 +1143,7 @@ void testLeftSemiJoinOnNullKeys() { .column(102, 107, 108, 109) .build(); Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } @@ -1153,7 +1153,7 @@ void testLeftSemiJoinOnNullKeys() { .column(102, 107, 109) .build(); Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -1174,7 +1174,7 @@ void testLeftAntiJoin() { .column(100, 101, 103, 104, 105, 106) .build(); Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -1196,7 +1196,7 @@ void testLeftAntiJoinOnNullKeys() { .column(100, 101, 103, 104, 105, 106) .build(); Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } @@ -1206,7 +1206,7 @@ void testLeftAntiJoinOnNullKeys() { .column(100, 101, 103, 104, 105, 106, 108) .build(); Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(1, true))) { + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -1224,7 +1224,7 @@ void testLeftAntiJoinWithNulls() { .column("20", "21", "22", "23", "24", "25", "26", "27", "28", "29") .build(); Table joinedTable = leftTable.onColumns(0, 2).leftAntiJoin(rightTable.onColumns(0, 1), true); - Table orderedJoinedTable = joinedTable.orderBy(Table.asc(2, true)); + Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(2, true)); Table expected = new Table.TestBuilder() .column( 360, 326, null, 306, null, 251, 301, 317) 
.column( 10, 11, null, 13, 14, 16, 18, 19) @@ -1249,8 +1249,8 @@ void testCrossJoin() { Table joinedTable = leftTable.crossJoin(rightTable); Table orderedJoinedTable = joinedTable.orderBy( - Table.asc(0, true), - Table.asc(1, true))) { + OrderByArg.asc(0, true), + OrderByArg.asc(1, true))) { assertTablesAreEqual(expected, orderedJoinedTable); } } @@ -2297,7 +2297,7 @@ void testGroupByUniqueCount() { try (Table t3 = t1 .groupBy(0, 1) .aggregate(Aggregation.nunique().onColumn(0)); - Table sorted = t3.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + Table sorted = t3.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1") .column( 0, 1, 3, 5) @@ -2318,7 +2318,7 @@ void testGroupByUniqueCountNulls() { try (Table t3 = t1 .groupBy(0, 1) .aggregate(Aggregation.nunique(true).onColumn(0)); - Table sorted = t3.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + Table sorted = t3.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1") .column( 0, 1, 3, 5) @@ -2370,8 +2370,8 @@ void testWindowingCount() { .decimal32Column(-1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3) // Decimal GBY Key .decimal64Column(1, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L) // Decimal OBY Key .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); - Table decSorted = unsorted.orderBy(Table.asc(0), Table.asc(4), Table.asc(5)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); + Table decSorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(4), OrderByArg.asc(5)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -2406,8 +2406,8 @@ void testWindowingMin() { .decimal64Column(1, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L) // Decimal OBY Key .decimal64Column(2, 7L, 5L, 1L, 9L, 7L, 9L, 8L, 2L, 8L, 0L, 6L, 6L) // Decimal Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); - Table decSorted = unsorted.orderBy(Table.asc(0), Table.asc(4), Table.asc(5)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); + Table decSorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(4), OrderByArg.asc(5)); ColumnVector expectSortedAggCol = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6); ColumnVector expectDecSortedAggCol = ColumnVector.decimalFromLongs(2, 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); @@ -2444,8 +2444,8 @@ void testWindowingMax() { .decimal64Column(1, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L) // Decimal OBY Key .decimal64Column(2, 7L, 5L, 1L, 9L, 7L, 9L, 8L, 2L, 8L, 0L, 6L, 6L) // Decimal Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); - Table decSorted = unsorted.orderBy(Table.asc(0), Table.asc(4), Table.asc(5)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); + Table decSorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(4), OrderByArg.asc(5)); ColumnVector expectSortedAggCol = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6); ColumnVector expectDecSortedAggCol = ColumnVector.decimalFromLongs(2, 7, 5, 1, 9, 7, 9, 8, 2, 
8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); @@ -2479,7 +2479,7 @@ void testWindowingSum() { .column(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6) // OBY Key .column(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -2509,8 +2509,8 @@ void testWindowingRowNumber() { .decimal64Column(1, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L) // Decimal OBY Key .decimal64Column(2, 7L, 5L, 1L, 9L, 7L, 9L, 8L, 2L, 8L, 0L, 6L, 6L) // Decimal Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); - Table decSorted = unsorted.orderBy(Table.asc(0), Table.asc(4), Table.asc(5)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); + Table decSorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(4), OrderByArg.asc(5)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6); ColumnVector expectDecSortedAggColumn = ColumnVector.decimalFromLongs(2, 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); @@ -2590,7 +2590,7 @@ void testWindowingCollect() { ).build(); ColumnVector expectSortedAggColumn = ColumnVector .fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, null, 0, 6, null)) { - try (Table sorted = raw.orderBy(Table.asc(0), Table.asc(1), Table.asc(2))) { + try (Table sorted = raw.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2))) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -2652,8 +2652,8 @@ void testWindowingLead() { .decimal64Column(-2, 7L, 5L, 1L, 9L, 7L, 9L, 8L, 2L, 8L, 0L, 6L, 6L) // Decimal Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); - Table decSorted = unsorted.orderBy(Table.asc(0), Table.asc(4), Table.asc(5)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); + Table decSorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(4), OrderByArg.asc(5)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6); ColumnVector expectDecSortedAggColumn = ColumnVector.decimalFromLongs(-2, 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); @@ -2745,8 +2745,8 @@ void testWindowingLag() { .decimal64Column(-2, 7L, 5L, 1L, 9L, 7L, 9L, 8L, 2L, 8L, 0L, 6L, 6L) // Decimal Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); - Table decSorted = unsorted.orderBy(Table.asc(0), Table.asc(4), Table.asc(5)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); + Table decSorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(4), OrderByArg.asc(5)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6); ColumnVector decExpectSortedAggColumn = ColumnVector.decimalFromLongs(-2, 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); @@ -2833,7 +2833,7 @@ void testWindowingMean() { 
.column( 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6) // OBY Key .column( 7, 5, 3, 7, 7, 9, 8, 4, 8, 0, 4, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectedSortedAggCol = ColumnVector.fromBoxedInts(7, 5, 3, 7, 7, 9, 8, 4, 8, 0, 4, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectedSortedAggCol, sortedAggColumn); @@ -2859,7 +2859,7 @@ void testWindowingOnMultipleDifferentColumns() { .column( 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6) // OBY Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectedSortedAggCol = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectedSortedAggCol, sortedAggColumn); @@ -2909,7 +2909,7 @@ void testWindowingWithoutGroupByColumns() { .build(); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6)) { - try (Table sorted = unsorted.orderBy(Table.asc(0))) { + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0))) { ColumnVector sortedAggColumn = sorted.getColumn(1); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -2934,7 +2934,7 @@ void testTimeRangeWindowingCount() { .timestampDayColumn( 1, 1, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -2962,7 +2962,7 @@ void testTimeRangeWindowingLead() { .timestampDayColumn( 1, 1, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -2991,7 +2991,7 @@ void testTimeRangeWindowingMax() { .timestampDayColumn( 1, 1, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3029,7 +3029,7 @@ void testTimeRangeWindowingRowNumber() { .timestampDayColumn( 1, 1, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column 
.build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3056,7 +3056,7 @@ void testTimeRangeWindowingCountDescendingTimestamps() { .timestampDayColumn( 7, 6, 6, 5, 5, 4, 4, 3, 3, 3, 2, 1, 1) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.desc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.desc(2)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3093,7 +3093,7 @@ void testTimeRangeWindowingWithoutGroupByColumns() { try (Table unsorted = new Table.TestBuilder().timestampDayColumn( 1, 1, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(1); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3137,7 +3137,7 @@ void testTimeRangeWindowingCountUnboundedPreceding() { .timestampDayColumn( 1, 1, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3167,7 +3167,7 @@ void testTimeRangeWindowingCountUnboundedASCWithNullsFirst() { .timestampDayColumn( X, X, X, 2, 3, 5, X, X, 1, 2, 4, 5, 7) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2, true)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2, true)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3239,7 +3239,7 @@ void testTimeRangeWindowingCountUnboundedDESCWithNullsFirst() { .timestampDayColumn( X, X, X, 5, 3, 2, X, X, 7, 5, 4, 2, 1) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.desc(2, false)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.desc(2, false)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); 
assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3316,7 +3316,7 @@ void testTimeRangeWindowingCountUnboundedASCWithNullsLast() { .timestampDayColumn( 2, 3, 5, X, X, X, 1, 2, 4, 5, 7, X, X) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.asc(2, false)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2, false)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3387,7 +3387,7 @@ void testTimeRangeWindowingCountUnboundedDESCWithNullsLast() { .timestampDayColumn( 5, 3, 2, X, X, X, 7, 5, 4, 2, 1, X, X) // Timestamp Key .column( 7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8) // Agg Column .build()) { - try (Table sorted = unsorted.orderBy(Table.asc(0), Table.asc(1), Table.desc(2, true)); + try (Table sorted = unsorted.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.desc(2, true)); ColumnVector expectSortedAggColumn = ColumnVector.fromBoxedInts(7, 5, 1, 9, 7, 9, 8, 2, 8, 0, 6, 6, 8)) { ColumnVector sortedAggColumn = sorted.getColumn(3); assertColumnsAreEqual(expectSortedAggColumn, sortedAggColumn); @@ -3463,7 +3463,7 @@ void testGroupByCountWithNulls() { .column( 1, 1, 1, null, 1, 1) .build()) { try (Table tmp = t1.groupBy(0).aggregate(count(1), count(2), count(3)); - Table t3 = tmp.orderBy(Table.asc(0, true)); + Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); HostColumnVector nullCountCol = t3.getColumn(2).copyToHost(); @@ -3500,7 +3500,7 @@ void testGroupByCountWithNullsIncluded() { .column( 1, 1, 1, null, 1, 1) .build()) { try (Table tmp = t1.groupBy(0).aggregate(count(1, true), count(2, true), count(3, true), count(3)); - Table t3 = tmp.orderBy(Table.asc(0, true)); + Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); HostColumnVector nullCountCol = t3.getColumn(2).copyToHost(); @@ -3547,7 +3547,7 @@ void testGroupByCountWithCollapsingNulls() { .build(); try (Table tmp = t1.groupBy(options, 0).aggregate(count(1), count(2), count(3)); - Table t3 = tmp.orderBy(Table.asc(0, true)); + Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); HostColumnVector nullCountCol = t3.getColumn(2).copyToHost(); @@ -3615,7 +3615,7 @@ void testGroupByArgMax() { try (Table t3 = t1.groupBy(0, 1) .aggregate(Aggregation.argMax().onColumn(2)); Table sorted = t3 - .orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + .orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column(1, 1, 1, 1) .column(0, 1, 2, 3) @@ -3637,7 +3637,7 @@ void testGroupByArgMin() { try (Table t3 = t1.groupBy(0, 1) .aggregate(Aggregation.argMin().onColumn(2)); Table sorted = t3 - .orderBy(Table.asc(0), Table.asc(1), Table.asc(2)); + .orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column(1, 1, 1, 1) .column(0, 1, 2, 3) @@ -3654,7 +3654,7 @@ void testGroupByMinBool() { .column(true, null, false, true, null, null) .column( 1, 1, 2, 2, 3, 3).build(); 
Table other = t1.groupBy(1).aggregate(min(0)); - Table ordered = other.orderBy(Table.asc(0)); + Table ordered = other.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column(1, 2, 3) .column (true, false, null) @@ -3669,7 +3669,7 @@ void testGroupByMaxBool() { .column(false, null, false, true, null, null) .column( 1, 1, 2, 2, 3, 3).build(); Table other = t1.groupBy(1).aggregate(max(0)); - Table ordered = other.orderBy(Table.asc(0)); + Table ordered = other.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column(1, 2, 3) .column (false, true, null) @@ -3695,7 +3695,7 @@ void testGroupByDuplicateAggregates() { .column( 1, 2, 2, 1).build()) { try (Table t3 = t1.groupBy(0, 1) .aggregate(max(2), min(2), min(2), max(2), min(2), count(1)); - Table t4 = t3.orderBy(Table.asc(2))) { + Table t4 = t3.orderBy(OrderByArg.asc(2))) { // verify t4 assertEquals(4, t4.getRowCount()); assertTablesAreEqual(t4, expected); diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 840f0c98987..7138bb49743 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -41,7 +41,7 @@ class AggregationKind(Enum): ALL = libcudf_aggregation.aggregation.Kind.ALL SUM_OF_SQUARES = libcudf_aggregation.aggregation.Kind.SUM_OF_SQUARES MEAN = libcudf_aggregation.aggregation.Kind.MEAN - VARIANCE = libcudf_aggregation.aggregation.Kind.VARIANCE + VAR = libcudf_aggregation.aggregation.Kind.VARIANCE STD = libcudf_aggregation.aggregation.Kind.STD MEDIAN = libcudf_aggregation.aggregation.Kind.MEDIAN QUANTILE = libcudf_aggregation.aggregation.Kind.QUANTILE @@ -50,13 +50,12 @@ class AggregationKind(Enum): NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT - COLLECT_SET = libcudf_aggregation.aggregation.Kind.COLLECT_SET + UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA cdef class Aggregation: - def __init__(self, op, **kwargs): self.c_obj = move(make_aggregation(op, kwargs)) @@ -246,7 +245,7 @@ cdef class _AggregationFactory: return agg @classmethod - def collect_set(cls): + def unique(cls): cdef Aggregation agg = Aggregation.__new__(Aggregation) agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation()) return agg diff --git a/python/cudf/cudf/_lib/cpp/concatenate.pxd b/python/cudf/cudf/_lib/cpp/concatenate.pxd index b5ec3bcb7d4..c776d23aa85 100644 --- a/python/cudf/cudf/_lib/cpp/concatenate.pxd +++ b/python/cudf/cudf/_lib/cpp/concatenate.pxd @@ -5,12 +5,22 @@ from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column, column_view from cudf._lib.cpp.table.table cimport table, table_view -from rmm._lib.device_buffer cimport device_buffer +from cudf._lib.cpp.utilities.host_span cimport host_span +from rmm._lib.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: + # The versions of concatenate taking vectors don't exist in libcudf + # C++, but passing a vector works because a host_span is implicitly + # constructable from a vector. 
In case they are needed in the future, + # host_span versions can be added, e.g: + # + # cdef device_buffer concatenate_masks "cudf::concatenate_masks"( + # host_span[column_view] views + # ) except + + cdef device_buffer concatenate_masks "cudf::concatenate_masks"( - const vector[column_view] columns + const vector[column_view] views ) except + cdef unique_ptr[column] concatenate_columns "cudf::concatenate"( const vector[column_view] columns diff --git a/python/cudf/cudf/_lib/cpp/labeling.pxd b/python/cudf/cudf/_lib/cpp/labeling.pxd new file mode 100644 index 00000000000..996ae4f9e38 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/labeling.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view + +cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: + ctypedef enum inclusive: + YES "cudf::inclusive::YES" + NO "cudf::inclusive::NO" + + cdef unique_ptr[column] label_bins ( + const column_view &input, + const column_view &left_edges, + inclusive left_inclusive, + const column_view &right_edges, + inclusive right_inclusive + ) except + diff --git a/python/cudf/cudf/_lib/cpp/lists/contains.pxd b/python/cudf/cudf/_lib/cpp/lists/contains.pxd new file mode 100644 index 00000000000..ec2f61d08fa --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/contains.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from cudf._lib.cpp.scalar.scalar cimport scalar + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + +from cudf._lib.cpp.column.column_view cimport column_view + +cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] contains( + lists_column_view lists, + scalar search_key, + ) except + diff --git a/python/cudf/cudf/_lib/cpp/utilities/host_span.pxd b/python/cudf/cudf/_lib/cpp/utilities/host_span.pxd new file mode 100644 index 00000000000..cbbe3710347 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/utilities/host_span.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from libcpp.vector cimport vector + +cdef extern from "cudf/utilities/span.hpp" namespace "cudf" nogil: + cdef cppclass host_span[T]: + host_span() except + + host_span(vector[T]) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 0f5cdc73d3b..713a2274a77 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -35,13 +35,15 @@ _GROUPBY_AGGS = { "median", "nunique", "nth", - "collect" + "collect", + "unique", } _CATEGORICAL_AGGS = { "count", "size", "nunique", + "unique", } _STRING_AGGS = { @@ -51,13 +53,15 @@ _STRING_AGGS = { "min", "nunique", "nth", - "collect" + "collect", + "unique", } _LIST_AGGS = { - "collect" + "collect", } + cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj cdef dict __dict__ @@ -145,12 +149,23 @@ cdef class GroupBy: vector[libcudf_groupby.aggregation_result] ] c_result - with nogil: - c_result = move( - self.c_obj.get()[0].aggregate( - c_agg_requests + try: + with nogil: + c_result = move( + self.c_obj.get()[0].aggregate( + c_agg_requests + ) ) - ) + except RuntimeError as e: + # TODO: remove this try..except after + # https://github.com/rapidsai/cudf/issues/7611 + # is resolved + if ("make_empty_column") in str(e): + raise NotImplementedError( + "Aggregation not supported for empty columns" + ) from e + else: + raise grouped_keys = Table.from_unique_ptr( move(c_result.first), diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx new file mode 100644 index 00000000000..1b553024347 --- /dev/null +++ b/python/cudf/cudf/_lib/labeling.pyx @@ -0,0 +1,47 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +import numpy as np +from enum import IntEnum + +from libc.stdint cimport uint32_t +from libcpp cimport bool as cbool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.replace import replace_nulls + +from cudf._lib.cpp.labeling cimport inclusive +from cudf._lib.cpp.labeling cimport label_bins as cpp_label_bins +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view + + +# Note that the parameter input shadows a Python built-in in the local scope, +# but I'm not too concerned about that since there's no use-case for actual +# input in this context. 
+def label_bins(Column input, Column left_edges, cbool left_inclusive, + Column right_edges, cbool right_inclusive): + cdef inclusive c_left_inclusive = \ + inclusive.YES if left_inclusive else inclusive.NO + cdef inclusive c_right_inclusive = \ + inclusive.YES if right_inclusive else inclusive.NO + + cdef column_view input_view = input.view() + cdef column_view left_edges_view = left_edges.view() + cdef column_view right_edges_view = right_edges.view() + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_label_bins( + input_view, + left_edges_view, + c_left_inclusive, + right_edges_view, + c_right_inclusive, + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 2971aad8313..7f745e58c67 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -17,6 +17,9 @@ from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column +from cudf._lib.scalar cimport DeviceScalar +from cudf._lib.cpp.scalar.scalar cimport scalar + from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type, order, null_order @@ -29,6 +32,8 @@ from cudf._lib.types cimport ( ) from cudf.core.dtypes import ListDtype +from cudf._lib.cpp.lists.contains cimport contains + from cudf._lib.cpp.lists.extract cimport extract_list_element @@ -93,6 +98,7 @@ def extract_element(Column col, size_type index): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) ) + cdef unique_ptr[column] c_result with nogil: @@ -100,3 +106,21 @@ def extract_element(Column col, size_type index): result = Column.from_unique_ptr(move(c_result)) return result + + +def contains_scalar(Column col, DeviceScalar search_key): + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef const scalar* search_key_value = search_key.get_raw_ptr() + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(contains( + list_view.get()[0], + search_key_value[0], + )) + + result = Column.from_unique_ptr(move(c_result)) + return result diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 0158df46cc4..d8b4fbbbe4b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -294,7 +294,9 @@ cpdef write_parquet( cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink) - if index is not False and not isinstance(table._index, cudf.RangeIndex): + if index is True or ( + index is None and not isinstance(table._index, cudf.RangeIndex) + ): tv = table.view() tbl_meta = make_unique[table_input_metadata](tv) for level, idx_name in enumerate(table._index.names): diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4fe795e57a9..13eedb34c18 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -99,15 +99,31 @@ cpdef generate_pandas_metadata(Table table, index): idx = table.index if isinstance(idx, cudf.core.index.RangeIndex): - descr = { - "kind": "range", - "name": table.index.name, - "start": table.index.start, - "stop": table.index.stop, - "step": table.index.step, - } + if index is None: + descr = { + "kind": "range", + "name": table.index.name, + "start": 
table.index.start, + "stop": table.index.stop, + "step": table.index.step, + } + else: + # When `index=True`, RangeIndex needs to be materialized. + materialized_idx = cudf.Index(idx._values, name=idx.name) + descr = \ + _index_level_name( + index_name=materialized_idx.name, + level=level, + column_names=col_names + ) + index_levels.append(materialized_idx) else: - descr = _index_level_name(idx.name, level, col_names) + descr = \ + _index_level_name( + index_name=idx.name, + level=level, + column_names=col_names + ) if is_categorical_dtype(idx): raise ValueError( "'category' column dtypes are currently not " diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a563248f4ab..0bacbe04356 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -178,6 +178,8 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: return cudf.Scalar(None, dtype=other.dtype) return cudf.Scalar(other) + elif other is None: + return cudf.Scalar(other, dtype=self.dtype) else: raise TypeError(f"cannot normalize {type(other)}") diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 7fbe602f07a..4ba675516ae 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -72,6 +72,9 @@ def binary_operator(self, op, other, reflect=False): result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) return result + def _apply_scan_op(self, op: str) -> ColumnBase: + return libcudf.reduce.scan(op, self, True) + def as_decimal_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DecimalColumn": diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 2204fbdea1f..b7f34e8c007 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -7,7 +7,12 @@ import cudf from cudf._lib.copying import segmented_gather -from cudf._lib.lists import count_elements, extract_element, sort_lists +from cudf._lib.lists import ( + contains_scalar, + count_elements, + extract_element, + sort_lists, +) from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin @@ -210,6 +215,44 @@ def get(self, index): else: raise IndexError("list index out of range") + def contains(self, search_key): + """ + Creates a column of bool values indicating whether the specified scalar + is an element of each row of a list column. 
+ + Parameters + ---------- + search_key : scalar + element being searched for in each row of the list column + + Returns + ------- + Column + + Examples + -------- + >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]]) + >>> s.list.contains(4) + Series([False, True, True]) + dtype: bool + """ + try: + res = self._return_or_inplace( + contains_scalar(self._column, search_key.device_value) + ) + except RuntimeError as e: + if ( + "Type/Scale of search key does not" + "match list column element type" in str(e) + ): + raise TypeError( + "Type/Scale of search key does not" + "match list column element type" + ) from e + raise + else: + return res + @property def leaves(self): """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e22b511db01..a39638106bb 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -275,6 +275,8 @@ def normalize_binop_value(self, other) -> BinaryOperand: return cudf.Scalar(other) elif np.isscalar(other): return cudf.Scalar(other) + elif other is None: + return cudf.Scalar(other, dtype=self.dtype) else: raise TypeError(f"cannot normalize {type(other)}") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bd009a9ad84..b5f57356698 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3841,10 +3841,32 @@ def argsort(self, ascending=True, na_position="last"): - Support axis='index' only. - Not supporting: inplace, kind - Ascending can be a list of bools to control per column + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]}) + >>> df + a b + 0 10 -10 + 1 0 10 + 2 2 1 + >>> inds = df.argsort() + >>> inds + 0 1 + 1 2 + 2 0 + dtype: int32 + >>> df.take(inds) + a b + 1 0 10 + 2 2 1 + 0 10 -10 """ - return self._get_sorted_inds( + inds_col = self._get_sorted_inds( ascending=ascending, na_position=na_position ) + return cudf.Series(inds_col) @annotate("SORT_INDEX", color="red", domain="cudf_python") def sort_index( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9d4643da637..a664c4fb182 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4725,8 +4725,9 @@ def cumsum(self, axis=0, skipna=True, *args, **kwargs): result_col[first_index:] = None # pandas always returns int64 dtype if original dtype is int or `bool` - if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype( - result_col.dtype, np.bool_ + if not is_decimal_dtype(result_col.dtype) and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) ): return Series( result_col.astype(np.int64)._apply_scan_op("sum"), @@ -4774,6 +4775,11 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") + if is_decimal_dtype(self.dtype): + raise NotImplementedError( + "cumprod does not currently support decimal types" + ) + skipna = True if skipna is None else skipna if skipna: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 18f2d7e474b..eb8aaaadd51 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1773,6 +1773,51 @@ def decimal_series(input, dtype): utils.assert_eq(expect, got) +@pytest.mark.parametrize( + "dtype", + [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", 
+ "int64", + "float32", + "float64", + "str", + "datetime64[ns]", + "datetime64[us]", + "datetime64[ms]", + "datetime64[s]", + "timedelta64[ns]", + "timedelta64[us]", + "timedelta64[ms]", + "timedelta64[s]", + ], +) +@pytest.mark.parametrize("null_scalar", [None, cudf.NA, np.datetime64("NaT")]) +@pytest.mark.parametrize("cmpop", _cmpops) +def test_column_null_scalar_comparison(dtype, null_scalar, cmpop): + # This test is meant to validate that comparing + # a series of any dtype with a null scalar produces + # a new series where all the elements are . + + if isinstance(null_scalar, np.datetime64): + if np.dtype(dtype).kind not in "mM": + pytest.skip() + null_scalar = null_scalar.astype(dtype) + + dtype = np.dtype(dtype) + + data = [1, 2, 3, 4, 5] + sr = cudf.Series(data, dtype=dtype) + result = cmpop(sr, null_scalar) + + assert result.isnull().all() + + @pytest.mark.parametrize("fn", ["eq", "ne", "lt", "gt", "le", "ge"]) def test_equality_ops_index_mismatch(fn): a = cudf.Series( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 76a02d5e74a..d72b88f1713 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8495,3 +8495,24 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): got = gdf.explode(label_to_explode, ignore_index) assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "df,ascending,expected", + [ + ( + cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), + True, + cudf.Series([1, 2, 0], dtype="int32"), + ), + ( + cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), + False, + cudf.Series([0, 2, 1], dtype="int32"), + ), + ], +) +def test_dataframe_argsort(df, ascending, expected): + actual = df.argsort(ascending=ascending) + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 8011510d340..a96db59dee3 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -12,7 +12,13 @@ import cudf from cudf.core import DataFrame, Series from cudf.core._compat import PANDAS_GE_110 -from cudf.tests.utils import assert_eq, assert_exceptions_equal +from cudf.tests.utils import ( + DATETIME_TYPES, + SIGNED_TYPES, + TIMEDELTA_TYPES, + assert_eq, + assert_exceptions_equal, +) _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") @@ -1532,3 +1538,26 @@ def test_groupby_nonempty_no_keys(pdf): lambda: gdf.groupby([]), compare_error_message=False, ) + + +@pytest.mark.parametrize( + "by,data", + [ + # ([], []), # error? 
+ ([1, 1, 2, 2], [0, 0, 1, 1]), + ([1, 2, 3, 4], [0, 0, 0, 0]), + ([1, 2, 1, 2], [0, 1, 1, 1]), + ], +) +@pytest.mark.parametrize( + "dtype", + SIGNED_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["string", "category"], +) +def test_groupby_unique(by, data, dtype): + pdf = pd.DataFrame({"by": by, "data": data}) + pdf["data"] = pdf["data"].astype(dtype) + gdf = cudf.from_pandas(pdf) + + expect = pdf.groupby("by")["data"].unique() + got = gdf.groupby("by")["data"].unique() + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 2ab1382b34e..5645ce60596 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -246,3 +246,38 @@ def test_get_nulls(): with pytest.raises(IndexError, match="list index out of range"): sr = cudf.Series([[], [], []]) sr.list.get(100) + + +@pytest.mark.parametrize( + "data, scalar, expect", + [ + ([[1, 2, 3], []], 1, [True, False],), + ([[1, 2, 3], [], [3, 4, 5]], 6, [False, False, False],), + ([[1.0, 2.0, 3.0], None, []], 2.0, [True, None, False],), + ([[None, "b", "c"], [], ["b", "e", "f"]], "b", [True, False, True],), + ([[None, 2, 3], None, []], 1, [None, None, False]), + ([[None, "b", "c"], [], ["b", "e", "f"]], "d", [None, False, False],), + ], +) +def test_contains_scalar(data, scalar, expect): + sr = cudf.Series(data) + expect = cudf.Series(expect) + got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type)) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data, expect", + [ + ([[1, 2, 3], []], [None, None],), + ([[1.0, 2.0, 3.0], None, []], [None, None, None],), + ([[None, 2, 3], [], None], [None, None, None],), + ([[1, 2, 3], [3, 4, 5]], [None, None],), + ([[], [], []], [None, None, None],), + ], +) +def test_contains_null_search_key(data, expect): + sr = cudf.Series(data) + expect = cudf.Series(expect, dtype="bool") + got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index a7a11c95e30..fe418d1ade1 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index a7a11c95e30..fe418d1ade1 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+
 import datetime
 import math
 import os
@@ -1718,24 +1719,24 @@ def test_parquet_nullable_boolean(tmpdir, engine):
     ],
 )
 @pytest.mark.parametrize("index", [None, True, False])
-def test_parquet_index(tmpdir, pdf, index):
-    pandas_path = tmpdir.join("pandas_index.parquet")
-    cudf_path = tmpdir.join("pandas_index.parquet")
+def test_parquet_index(pdf, index):
+    pandas_buffer = BytesIO()
+    cudf_buffer = BytesIO()
 
     gdf = cudf.from_pandas(pdf)
 
-    pdf.to_parquet(pandas_path, index=index)
-    gdf.to_parquet(cudf_path, index=index)
+    pdf.to_parquet(pandas_buffer, index=index)
+    gdf.to_parquet(cudf_buffer, index=index)
 
-    expected = pd.read_parquet(cudf_path)
-    actual = cudf.read_parquet(cudf_path)
+    expected = pd.read_parquet(cudf_buffer)
+    actual = cudf.read_parquet(pandas_buffer)
 
-    assert_eq(expected, actual)
+    assert_eq(expected, actual, check_index_type=True)
 
-    expected = pd.read_parquet(pandas_path)
-    actual = cudf.read_parquet(pandas_path)
+    expected = pd.read_parquet(pandas_buffer)
+    actual = cudf.read_parquet(cudf_buffer)
 
-    assert_eq(expected, actual)
+    assert_eq(expected, actual, check_index_type=True)
 
 
 @pytest.mark.parametrize("engine", ["cudf", "pyarrow"])
diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py
index dce65947460..f7e8c5a8563 100644
--- a/python/cudf/cudf/tests/test_scan.py
+++ b/python/cudf/cudf/tests/test_scan.py
@@ -6,6 +6,7 @@
 
 import cudf
 from cudf.tests.utils import INTEGER_TYPES, NUMERIC_TYPES, assert_eq, gen_rand
+from cudf.core.dtypes import Decimal64Dtype
 
 params_sizes = [0, 1, 2, 5]
 
@@ -61,6 +62,21 @@ def test_cumsum_masked():
     assert_eq(got, expected)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [Decimal64Dtype(8, 4), Decimal64Dtype(10, 5), Decimal64Dtype(12, 7)],
+)
+def test_cumsum_decimal(dtype):
+    data = ["243.32", "48.245", "-7234.298", np.nan, "-467.2"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cumsum()
+    expected = cudf.Series.from_pandas(pser.cumsum()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
 @pytest.mark.parametrize("dtype,nelem", list(_gen_params()))
 def test_cummin(dtype, nelem):
     if dtype == np.int8:
@@ -103,6 +119,21 @@ def test_cummin_masked():
     assert_eq(gs.cummin(), expected)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [Decimal64Dtype(8, 4), Decimal64Dtype(11, 6), Decimal64Dtype(14, 7)],
+)
+def test_cummin_decimal(dtype):
+    data = ["8394.294", np.nan, "-9940.444", np.nan, "-23.928"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cummin()
+    expected = cudf.Series.from_pandas(pser.cummin()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
 @pytest.mark.parametrize("dtype,nelem", list(_gen_params()))
 def test_cummax(dtype, nelem):
     if dtype == np.int8:
@@ -145,6 +176,21 @@ def test_cummax_masked():
     assert_eq(gs.cummax(), expected)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [Decimal64Dtype(8, 4), Decimal64Dtype(11, 6), Decimal64Dtype(14, 7)],
+)
+def test_cummax_decimal(dtype):
+    data = [np.nan, "54.203", "8.222", "644.32", "-562.272"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cummax()
+    expected = cudf.Series.from_pandas(pser.cummax()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
 @pytest.mark.parametrize("dtype,nelem", list(_gen_params()))
 def test_cumprod(dtype, nelem):
     if dtype == np.int8:
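The new decimal scan tests boil down to the following usage; a minimal sketch assuming a working cudf build, with precision and scale chosen to cover the sample values (variable names are illustrative):

    import cudf
    from cudf.core.dtypes import Decimal64Dtype

    # Strings cast cleanly to fixed-point, mirroring the test setup above.
    gser = cudf.Series(["243.32", "48.245", "-7234.298"]).astype(Decimal64Dtype(10, 5))

    # cumsum/cummin/cummax now run directly on decimal columns and,
    # per the tests above, the results keep the decimal dtype.
    print(gser.cumsum())
    print(gser.cummin())
    print(gser.cummax())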
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 8875a36dba8..8af225ecb58 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -154,7 +154,15 @@ def is_numerical_dtype(obj):
 
 
 def is_string_dtype(obj):
-    return pd.api.types.is_string_dtype(obj) and not is_categorical_dtype(obj)
+    return (
+        pd.api.types.is_string_dtype(obj)
+        # Reject all cudf extension types.
+        and not is_categorical_dtype(obj)
+        and not is_decimal_dtype(obj)
+        and not is_list_dtype(obj)
+        and not is_struct_dtype(obj)
+        and not is_interval_dtype(obj)
+    )
 
 
 def is_datetime_dtype(obj):
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 5d52d6c7da4..16511627aa2 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 
 import datetime
 import os
@@ -193,7 +193,10 @@
 index : bool, default None
     If ``True``, include the dataframe's index(es) in the file output. If
     ``False``, they will not be written to the file. If ``None``, the
-    engine's default behavior will be used.
+    engine's default behavior will be used. However, instead of being saved
+    as values, the ``RangeIndex`` will be stored as a range in the metadata
+    so it doesn't require much space and is faster. Other indexes will
+    be included as columns in the file output.
 partition_cols : list, optional, default None
     Column names by which to partition the dataset
     Columns are partitioned in the order they are given
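To illustrate the documented index behaviour, a small round-trip sketch (buffer and frame names are hypothetical; assumes a working cudf build):

    from io import BytesIO

    import cudf

    gdf = cudf.DataFrame({"a": [1, 2, 3]})  # default RangeIndex

    buf = BytesIO()
    # index=None: the RangeIndex is recorded as a range in the parquet
    # metadata instead of being materialized as a data column.
    gdf.to_parquet(buf, index=None)

    buf.seek(0)
    print(cudf.read_parquet(buf).index)  # expected: RangeIndex(start=0, stop=3, step=1)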