diff --git a/build.sh b/build.sh index 5eb404d02a8..d75053f8849 100755 --- a/build.sh +++ b/build.sh @@ -135,7 +135,7 @@ if hasArg clean; then fi if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then - CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=ALL" + CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=" echo "Building for the architecture of the GPU in the system..." else CUDF_CMAKE_CUDA_ARCHITECTURES="" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index e709824721c..5657d21889f 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -134,6 +134,7 @@ test: - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp + - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/contains.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2a51ad5e55a..f15fd649b83 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -117,6 +117,8 @@ include(cmake/Modules/ConfigureCUDA.cmake) # find zlib find_package(ZLIB REQUIRED) +# find Threads (needed by cudftestutil) +find_package(Threads REQUIRED) # add third party dependencies using CPM include(cmake/thirdparty/CUDF_GetCPM.cmake) # find boost @@ -133,8 +135,12 @@ include(cmake/thirdparty/CUDF_GetArrow.cmake) include(cmake/thirdparty/CUDF_GetDLPack.cmake) # find libcu++ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) +# find or install GoogleTest +include(cmake/thirdparty/CUDF_GetGTest.cmake) # Stringify libcudf and libcudacxx headers used in JIT operations include(cmake/Modules/StringifyJITHeaders.cmake) +# find cuFile +include(cmake/Modules/FindcuFile.cmake) ################################################################################################### # - library targets ------------------------------------------------------------------------------- @@ -240,6 +246,7 @@ add_library(cudf src/io/statistics/column_stats.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp + src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/type_conversion.cpp src/jit/cache.cpp @@ -256,6 +263,7 @@ add_library(cudf src/lists/copying/gather.cu src/lists/copying/segmented_gather.cu src/lists/count_elements.cu + src/lists/explode.cu src/lists/extract.cu src/lists/drop_list_duplicates.cu src/lists/lists_column_factories.cu @@ -285,7 +293,6 @@ add_library(cudf src/replace/nulls.cu src/replace/replace.cu src/reshape/byte_cast.cu - src/reshape/explode.cu src/reshape/interleave_columns.cu src/reshape/tile.cu src/rolling/grouped_rolling.cu @@ -417,7 +424,8 @@ target_include_directories(cudf "$" "$" PRIVATE "$" - INTERFACE "$") + INTERFACE "$" + "$") # Add Conda library paths if specified if(CONDA_LINK_DIRS) @@ -464,6 +472,11 @@ else() target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart CUDA::cuda_driver) endif() +# Add cuFile interface if available +if(TARGET cuFile::cuFile_interface) + target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) +endif() + file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld" [=[ SECTIONS @@ -480,34 +493,37 @@ add_library(cudf::cudf ALIAS cudf) # - tests and benchmarks -------------------------------------------------------------------------- ################################################################################################### -if (CUDF_BUILD_TESTS OR 
CUDF_BUILD_BENCHMARKS) - # Find or install GoogleTest - CPMFindPackage(NAME GTest - VERSION 1.10.0 - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.10.0 - GIT_SHALLOW TRUE - OPTIONS "INSTALL_GTEST OFF" - # googletest >= 1.10.0 provides a cmake config file -- use it if it exists - FIND_PACKAGE_ARGUMENTS "CONFIG") - # Add GTest aliases if they don't already exist. - # Assumes if GTest::gtest doesn't exist, the others don't either. - # TODO: Is this always a valid assumption? - if(NOT TARGET GTest::gtest) - add_library(GTest::gtest ALIAS gtest) - add_library(GTest::gmock ALIAS gmock) - add_library(GTest::gtest_main ALIAS gtest_main) - add_library(GTest::gmock_main ALIAS gmock_main) - endif() - if(GTest_ADDED) - install(TARGETS gmock - gtest - gmock_main - gtest_main - DESTINATION lib - EXPORT cudf-targets) - endif() -endif() +################################################################################################### +# - build cudftestutil ---------------------------------------------------------------------------- + +add_library(cudftestutil STATIC + tests/utilities/base_fixture.cpp + tests/utilities/column_utilities.cu + tests/utilities/table_utilities.cu + tests/strings/utilities.cu) + +target_compile_options(cudftestutil + PUBLIC "$<$:${CUDF_CXX_FLAGS}>" + "$<$:${CUDF_CUDA_FLAGS}>" +) + +target_compile_features(cudftestutil PUBLIC cxx_std_14 cuda_std_14) + +target_link_libraries(cudftestutil + PUBLIC GTest::gmock + GTest::gtest + Threads::Threads + cudf) + +target_include_directories(cudftestutil + PUBLIC "$" + "$") + +install(TARGETS cudftestutil + DESTINATION lib + EXPORT cudf-targets) + +add_library(cudf::cudftestutil ALIAS cudftestutil) ################################################################################################### # - add tests ------------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 9e11f4df075..cd083ebec7a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -185,9 +185,11 @@ ConfigureBench(STRINGS_BENCH string/convert_floats_benchmark.cpp string/copy_benchmark.cpp string/extract_benchmark.cpp + string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp string/replace_re_benchmark.cpp string/split_benchmark.cpp + string/substring_benchmark.cpp string/url_decode_benchmark.cpp) diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index ad2ce095b6e..dd1bbcba0b4 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -88,4 +88,4 @@ class benchmark : public ::benchmark::Fixture { std::shared_ptr mr; }; -}; // namespace cudf +} // namespace cudf diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu new file mode 100644 index 00000000000..6c5dceffaa8 --- /dev/null +++ b/cpp/benchmarks/string/factory_benchmark.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +class StringsFactory : public cudf::benchmark { +}; + +static void BM_factory(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::make_strings_column(pairs); + } + + cudf::strings_column_view input(table->view().column(0)); + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringsFactory, name) \ + (::benchmark::State & st) { BM_factory(st); } \ + BENCHMARK_REGISTER_F(StringsFactory, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index f81f859de74..9c709b064dd 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -17,6 +17,8 @@ #include +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. * diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring_benchmark.cpp new file mode 100644 index 00000000000..d47c42e45be --- /dev/null +++ b/cpp/benchmarks/string/substring_benchmark.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +class StringSubstring : public cudf::benchmark { +}; + +enum substring_type { position, multi_position, delimiter, multi_delimiter }; + +static void BM_substring(benchmark::State& state, substring_type rt) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + cudf::strings_column_view input(table->view().column(0)); + auto starts_itr = thrust::constant_iterator(1); + auto stops_itr = thrust::constant_iterator(max_str_length / 2); + cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows); + cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows); + auto delim_itr = thrust::constant_iterator(" "); + cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + switch (rt) { + case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break; + case multi_position: cudf::strings::slice_strings(input, starts, stops); break; + case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break; + case multi_delimiter: + cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1); + break; + } + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringSubstring, name) \ + (::benchmark::State & st) { BM_substring(st, substring_type::name); } \ + BENCHMARK_REGISTER_F(StringSubstring, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(position) +STRINGS_BENCHMARK_DEFINE(multi_position) +STRINGS_BENCHMARK_DEFINE(delimiter) +STRINGS_BENCHMARK_DEFINE(multi_delimiter) diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index 44699a13206..d4be6e65021 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -17,26 +17,16 @@ # Find the CUDAToolkit find_package(CUDAToolkit REQUIRED) -# Must come after find_package(CUDAToolkit) because we symlink -# ccache as a compiler front-end for nvcc in gpuCI CPU builds. -enable_language(CUDA) - -if(CMAKE_CUDA_COMPILER_VERSION) - # Compute the version. 
from CMAKE_CUDA_COMPILER_VERSION - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${CMAKE_CUDA_COMPILER_VERSION}) - set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}") -endif() - -message(VERBOSE "CUDF: CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}") -message(VERBOSE "CUDF: CUDA_VERSION_MINOR: ${CUDA_VERSION_MINOR}") -message(STATUS "CUDF: CUDA_VERSION: ${CUDA_VERSION}") - # Auto-detect available GPU compute architectures - include(${CUDF_SOURCE_DIR}/cmake/Modules/SetGPUArchs.cmake) message(STATUS "CUDF: Building CUDF for GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}") +# Must come after find_package(CUDAToolkit) because we symlink +# ccache as a compiler front-end for nvcc in gpuCI CPU builds. +# Must also come after we detect and potentially rewrite +# CMAKE_CUDA_ARCHITECTURES +enable_language(CUDA) + if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) if(CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS) diff --git a/cpp/cmake/Modules/EvalGPUArchs.cmake b/cpp/cmake/Modules/EvalGPUArchs.cmake index 6c747a0b867..09e42c6cc7a 100644 --- a/cpp/cmake/Modules/EvalGPUArchs.cmake +++ b/cpp/cmake/Modules/EvalGPUArchs.cmake @@ -14,12 +14,21 @@ # limitations under the License. #============================================================================= +# Unset this first in case it's set to +set(CMAKE_CUDA_ARCHITECTURES OFF) + +# Enable CUDA so we can invoke nvcc +enable_language(CUDA) + +# Function uses the CUDA runtime API to query the compute capability of the device, so if a user +# doesn't pass any architecture options to CMake we only build the current architecture function(evaluate_gpu_archs gpu_archs) set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu) set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs) set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log) - file(WRITE ${eval_file} -[=[ + file( + WRITE ${eval_file} + " #include #include #include @@ -32,32 +41,30 @@ int main(int argc, char** argv) { char buff[32]; cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue; - sprintf(buff, "%d%d", prop.major, prop.minor); + sprintf(buff, \"%d%d\", prop.major, prop.minor); archs.insert(buff); } } if(archs.empty()) { - printf("ALL"); + printf(\"${SUPPORTED_CUDA_ARCHITECTURES}\"); } else { bool first = true; for(const auto& arch : archs) { - printf(first? "%s" : ";%s", arch.c_str()); + printf(first? 
\"%s\" : \";%s\", arch.c_str()); first = false; } } - printf("\n"); + printf(\"\\n\"); return 0; } -]=]) +") execute_process( - COMMAND ${CMAKE_CUDA_COMPILER} - -std=c++11 - -o ${eval_exe} - --run - ${eval_file} + COMMAND ${CMAKE_CUDA_COMPILER} -std=c++11 -o ${eval_exe} --run ${eval_file} OUTPUT_VARIABLE __gpu_archs OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_FILE ${error_file}) - message(VERBOSE "CUDF: Auto detection of gpu-archs: ${__gpu_archs}") - set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE) -endfunction() + message(STATUS "CUDF: Auto detection of gpu-archs: ${__gpu_archs}") + set(${gpu_archs} + ${__gpu_archs} + PARENT_SCOPE) +endfunction(evaluate_gpu_archs) diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index e67b79d9d60..4f67e186f42 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -93,6 +93,12 @@ find_package_handle_standard_args(cuFile cuFile_VERSION ) +if (cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface) + add_library(cuFile::cuFile_interface IMPORTED INTERFACE) + target_include_directories(cuFile::cuFile_interface INTERFACE "$") + target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") + target_compile_definitions(cuFile::cuFile_interface INTERFACE CUFILE_FOUND) +endif () if (cuFile_FOUND AND NOT TARGET cuFile::cuFile) add_library(cuFile::cuFile UNKNOWN IMPORTED) diff --git a/cpp/cmake/Modules/SetGPUArchs.cmake b/cpp/cmake/Modules/SetGPUArchs.cmake index 396023ee9a9..61e4e6bc198 100644 --- a/cpp/cmake/Modules/SetGPUArchs.cmake +++ b/cpp/cmake/Modules/SetGPUArchs.cmake @@ -25,35 +25,41 @@ else() list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "62" "72") endif() -if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11) +# CMake < 3.20 has a bug in FindCUDAToolkit where it won't properly detect the CUDAToolkit version +# when find_package(CUDAToolkit) occurs before enable_language(CUDA) +if(NOT DEFINED CUDAToolkit_VERSION AND CMAKE_CUDA_COMPILER) + execute_process(COMMAND ${CMAKE_CUDA_COMPILER} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) +endif() + +if(CUDAToolkit_VERSION_MAJOR LESS 11) list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "80") endif() -if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10) +if(CUDAToolkit_VERSION_MAJOR LESS 10) list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "75") endif() -if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9) +if(CUDAToolkit_VERSION_MAJOR LESS 9) list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "70") endif() -if(CUDF_BUILD_FOR_DETECTED_ARCHS) - include(${CUDF_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake) - evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES) - if(CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL") - unset(CMAKE_CUDA_ARCHITECTURES CACHE) - set(CUDF_BUILD_FOR_ALL_ARCHS TRUE) - else() - set(CUDF_BUILD_FOR_ALL_ARCHS FALSE) - list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real") - endif() -endif() - -if(CUDF_BUILD_FOR_ALL_ARCHS) +if(${PROJECT_NAME}_BUILD_FOR_ALL_ARCHS) set(CMAKE_CUDA_ARCHITECTURES ${SUPPORTED_CUDA_ARCHITECTURES}) - # CMake architecture list entry of "80" means to build compute and sm. - # What we want is for the newest arch only to build that way - # while the rest built only for sm. 
- list(SORT CMAKE_CUDA_ARCHITECTURES ORDER ASCENDING) + + # CMake architecture list entry of "80" means to build compute and sm. What we want is for the + # newest arch only to build that way while the rest built only for sm. list(POP_BACK CMAKE_CUDA_ARCHITECTURES latest_arch) list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real") list(APPEND CMAKE_CUDA_ARCHITECTURES ${latest_arch}) + +elseif(${PROJECT_NAME}_BUILD_FOR_DETECTED_ARCHS) + include(${PROJECT_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake) + evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES) + + list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real") endif() diff --git a/cpp/cmake/Modules/StringifyJITHeaders.cmake b/cpp/cmake/Modules/StringifyJITHeaders.cmake index 36752d7f715..0bfb37773dc 100644 --- a/cpp/cmake/Modules/StringifyJITHeaders.cmake +++ b/cpp/cmake/Modules/StringifyJITHeaders.cmake @@ -164,5 +164,5 @@ add_custom_target(stringify_run DEPENDS # - copy libcu++ ---------------------------------------------------------------------------------- # `${LIBCUDACXX_INCLUDE_DIR}/` specifies that the contents of this directory will be installed (not the directory itself) -file(INSTALL "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") -file(INSTALL "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") +file(COPY "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") +file(COPY "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in index 5f6b265384e..3f4d2e5586e 100644 --- a/cpp/cmake/cudf-build-config.cmake.in +++ b/cpp/cmake/cudf-build-config.cmake.in @@ -36,6 +36,8 @@ include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetThrust.cmake) # find rmm set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetRMM.cmake) +# find gtest +include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetGTest.cmake) # find arrow if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in index aeb7d9915cf..1147e1160e7 100644 --- a/cpp/cmake/cudf-config.cmake.in +++ b/cpp/cmake/cudf-config.cmake.in @@ -19,13 +19,8 @@ find_dependency(Arrow @CUDF_VERSION_Arrow@) find_dependency(ArrowCUDA @CUDF_VERSION_Arrow@) find_dependency(Boost @CUDF_MIN_VERSION_Boost@) -find_dependency(jitify) find_dependency(rmm @CUDF_MIN_VERSION_rmm@) -find_dependency(Thrust @CUDF_MIN_VERSION_Thrust@) -find_dependency(dlpack @CUDF_MIN_VERSION_dlpack@) -find_dependency(libcudacxx @CUDF_MIN_VERSION_libcudacxx@) - -thrust_create_target(cudf::Thrust FROM_OPTIONS) +find_dependency(gtest @CUDF_MIN_VERSION_gtest@) list(POP_FRONT CMAKE_MODULE_PATH) diff --git a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake index f50b9e7f646..5162aaf6ce7 100644 --- a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake @@ -17,3 +17,14 @@ if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) endif() include(${CPM_DOWNLOAD_LOCATION}) + +# If a target is installed, found by the `find_package` step of CPMFindPackage, +# and marked as IMPORTED, make it globally accessible to consumers of our libs. 
+function(fix_cmake_global_defaults target) + if(TARGET ${target}) + get_target_property(_is_imported ${target} IMPORTED) + if(_is_imported) + set_target_properties(${target} PROPERTIES IMPORTED_GLOBAL TRUE) + endif() + endif() +endfunction() diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake new file mode 100644 index 00000000000..2911e4fce29 --- /dev/null +++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake @@ -0,0 +1,53 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_gtest VERSION) + # Find or install GoogleTest + CPMFindPackage(NAME GTest + VERSION ${VERSION} + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-${VERSION} + GIT_SHALLOW TRUE + OPTIONS "INSTALL_GTEST OFF" + # googletest >= 1.10.0 provides a cmake config file -- use it if it exists + FIND_PACKAGE_ARGUMENTS "CONFIG") + # Add GTest aliases if they don't already exist. + # Assumes if GTest::gtest doesn't exist, the others don't either. + # TODO: Is this always a valid assumption? + if(NOT TARGET GTest::gtest) + add_library(GTest::gtest ALIAS gtest) + add_library(GTest::gmock ALIAS gmock) + add_library(GTest::gtest_main ALIAS gtest_main) + add_library(GTest::gmock_main ALIAS gmock_main) + endif() + # Make sure consumers of cudf can also see GTest::* targets + fix_cmake_global_defaults(GTest::gtest) + fix_cmake_global_defaults(GTest::gmock) + fix_cmake_global_defaults(GTest::gtest_main) + fix_cmake_global_defaults(GTest::gmock_main) + if(GTest_ADDED) + install(TARGETS gmock + gtest + gmock_main + gtest_main + DESTINATION lib + EXPORT cudf-targets) + endif() +endfunction() + +set(CUDF_MIN_VERSION_gtest 1.10.0) + +find_and_configure_gtest(${CUDF_MIN_VERSION_gtest}) diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake index 16c8a2b39f4..54e0a8620c5 100644 --- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake @@ -48,13 +48,9 @@ function(find_and_configure_rmm VERSION) cudf_restore_if_enabled(BUILD_TESTS) cudf_restore_if_enabled(BUILD_BENCHMARKS) - #Make sure consumers of cudf can also see rmm::rmm - if(TARGET rmm::rmm) - get_target_property(rmm_is_imported rmm::rmm IMPORTED) - if(rmm_is_imported) - set_target_properties(rmm::rmm PROPERTIES IMPORTED_GLOBAL TRUE) - endif() - endif() + # Make sure consumers of cudf can also see rmm::rmm + fix_cmake_global_defaults(rmm::rmm) + if(NOT rmm_BINARY_DIR IN_LIST CMAKE_PREFIX_PATH) list(APPEND CMAKE_PREFIX_PATH "${rmm_BINARY_DIR}") set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} PARENT_SCOPE) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 0ae403458a0..e0eb60af070 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,23 +107,35 @@ class data_sink { */ virtual bool supports_device_write() const { return false; } + /** + * @brief Estimates whether a direct device write would be more optimal for the given size. + * + * @param size Number of bytes to write + * @return whether the device write is expected to be more performant for the given size + */ + virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); } + /** * @brief Append the buffer content to the sink from a gpu address * - * @param[in] data Pointer to the buffer to be written into the sink object - * @param[in] size Number of bytes to write + * For optimal performance, should only be called when `is_device_write_preferred` returns `true`. + * Data sink implementations that don't support direct device writes don't need to override + * this function. * - * @return void + * @throws cudf::logic_error the object does not support direct device writes, i.e. + * `supports_device_write` returns `false`. + * + * @param gpu_data Pointer to the buffer to be written into the sink object + * @param size Number of bytes to write + * @param stream CUDA stream to use */ virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { - CUDF_FAIL("data_sink classes that support device_write must override this function."); + CUDF_FAIL("data_sink classes that support device_write must override it."); } /** * @brief Flush the data written into the sink - * - * @return void */ virtual void flush() = 0; diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 88f2bd187e2..8fcc045e6d2 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -50,12 +52,15 @@ class datasource { /** * @brief Returns the address of the data in the buffer. */ - virtual const uint8_t* data() const = 0; + virtual uint8_t const* data() const = 0; /** * @brief Base class destructor */ virtual ~buffer() {} + + template + static std::unique_ptr create(Container&& data_owner); }; /** @@ -147,37 +152,57 @@ class datasource { */ virtual bool supports_device_read() const { return false; } + /** + * @brief Estimates whether a direct device read would be more optimal for the given size. + * + * @param size Number of bytes to read + * @return whether the device read is expected to be more performant for the given size + */ + virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } + /** * @brief Returns a device buffer with a subset of data from the source. * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read + * @throws cudf::logic_error the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. 
+ * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use * * @return The data buffer in the device memory */ - virtual std::unique_ptr device_read(size_t offset, size_t size) + virtual std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** * @brief Reads a selected range into a preallocated device buffer * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read - * @param[in] dst Address of the existing device memory + * @throws cudf::logic_error when the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use * * @return The number of bytes read (can be smaller than size) */ - virtual size_t device_read(size_t offset, size_t size, uint8_t* dst) + virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** @@ -205,14 +230,57 @@ class datasource { size_t size() const override { return _size; } - const uint8_t* data() const override { return _data; } + uint8_t const* data() const override { return _data; } private: uint8_t* const _data; size_t const _size; }; + + /** + * @brief Derived implementation of `buffer` that owns the data. + * + * Can use different container types to hold the data buffer. + * + * @tparam Container Type of the container object that owns the data + */ + template + class owning_buffer : public buffer { + public: + /** + * @brief Moves the input container into the newly created object. + */ + owning_buffer(Container&& data_owner) + : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) + { + } + + /** + * @brief Moves the input container into the newly created object, and exposes a subspan of the + * buffer. + */ + owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) + : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + { + } + + size_t size() const override { return _size; } + + uint8_t const* data() const override { return static_cast(_data_ptr); } + + private: + Container _data; + void const* _data_ptr; + size_t _size; + }; }; +template +std::unique_ptr datasource::buffer::create(Container&& data_owner) +{ + return std::make_unique>(std::move(data_owner)); +} + /** * @brief Implementation class for reading from an Apache Arrow file. The file * could be a memory-mapped file or other implementation supported by Arrow. 
@@ -230,7 +298,7 @@ class arrow_io_source : public datasource { { } size_t size() const override { return arrow_buffer->size(); } - const uint8_t* data() const override { return arrow_buffer->data(); } + uint8_t const* data() const override { return arrow_buffer->data(); } }; public: diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp new file mode 100644 index 00000000000..156d4b9275d --- /dev/null +++ b/cpp/include/cudf/lists/explode.hpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cudf { + +/** + * @brief Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * ``` + * Note that null lists are not included in the resulting table, but nulls inside + * lists and empty lists will be represented with a null entry for that column in that row. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with explode_col exploded. + */ +std::unique_ptr explode( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Explodes a list column's elements and includes a position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * ``` + * Note that null lists are not included in the resulting table, but nulls inside + * lists and empty lists will be represented with a null entry for that column in that row. + * + * @param input_table Table to explode. 
+ * @param explode_column_idx Column index to explode inside the table.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ *
+ * @return A new table with exploded value and position. The column order of the returned table is
+ *         [cols before explode_input, explode_position, explode_value, cols after explode_input].
+ */
+std::unique_ptr<table> explode_position(
+  table_view const& input_table,
+  size_type explode_column_idx,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Explodes a list column's elements retaining any null entries or empty lists inside.
+ *
+ * Any list is exploded, which means the elements of the list in each row are expanded into new rows
+ * in the output. The corresponding rows for other columns in the input are duplicated. Example:
+ * ```
+ * [[5,10,15], 100],
+ * [[20,25], 200],
+ * [[30], 300],
+ * returns
+ * [5, 100],
+ * [10, 100],
+ * [15, 100],
+ * [20, 200],
+ * [25, 200],
+ * [30, 300],
+ * ```
+ *
+ * Nulls and empty lists propagate as null entries in the result.
+ * ```
+ * [[5,null,15], 100],
+ * [null, 200],
+ * [[], 300],
+ * returns
+ * [5, 100],
+ * [null, 100],
+ * [15, 100],
+ * [null, 200],
+ * [null, 300],
+ * ```
+ *
+ * @param input_table Table to explode.
+ * @param explode_column_idx Column index to explode inside the table.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ *
+ * @return A new table with explode_col exploded.
+ */
+std::unique_ptr<table> explode_outer(
+  table_view const& input_table,
+  size_type explode_column_idx,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Explodes a list column's elements retaining any null entries or empty lists and includes a
+ * position column.
+ *
+ * Any list is exploded, which means the elements of the list in each row are expanded into new rows
+ * in the output. The corresponding rows for other columns in the input are duplicated. A position
+ * column is added that has the index inside the original list for each row. Example:
+ * ```
+ * [[5,10,15], 100],
+ * [[20,25], 200],
+ * [[30], 300],
+ * returns
+ * [0, 5, 100],
+ * [1, 10, 100],
+ * [2, 15, 100],
+ * [0, 20, 200],
+ * [1, 25, 200],
+ * [0, 30, 300],
+ * ```
+ *
+ * Nulls and empty lists propagate as null entries in the result.
+ * ```
+ * [[5,null,15], 100],
+ * [null, 200],
+ * [[], 300],
+ * returns
+ * [0, 5, 100],
+ * [1, null, 100],
+ * [2, 15, 100],
+ * [0, null, 200],
+ * [0, null, 300],
+ * ```
+ *
+ * @param input_table Table to explode.
+ * @param explode_column_idx Column index to explode inside the table.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ *
+ * @return A new table with explode_col exploded.
+ */
+std::unique_ptr<table>
explode_outer_position( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group + +} // namespace cudf diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index a6030f31e6d..74e4ebb8d05 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -97,92 +97,6 @@ std::unique_ptr byte_cast( flip_endianness endian_configuration, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Explodes a list column's elements. - * - * Any list is exploded, which means the elements of the list in each row are expanded into new rows - * in the output. The corresponding rows for other columns in the input are duplicated. Example: - * ``` - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [5, 100], - * [10, 100], - * [15, 100], - * [20, 200], - * [25, 200], - * [30, 300], - * ``` - * - * Nulls and empty lists propagate in different ways depending on what is null or empty. - *``` - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [5, 100], - * [null, 100], - * [15, 100], - * ``` - * Note that null lists are not included in the resulting table, but nulls inside - * lists and empty lists will be represented with a null entry for that column in that row. - * - * @param input_table Table to explode. - * @param explode_column_idx Column index to explode inside the table. - * @param mr Device memory resource used to allocate the returned column's device memory. - * - * @return A new table with explode_col exploded. - */ -std::unique_ptr
explode( - table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Explodes a list column's elements and includes a position column. - * - * Any list is exploded, which means the elements of the list in each row are expanded into new rows - * in the output. The corresponding rows for other columns in the input are duplicated. A position - * column is added that has the index inside the original list for each row. Example: - * ``` - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [0, 5, 100], - * [1, 10, 100], - * [2, 15, 100], - * [0, 20, 200], - * [1, 25, 200], - * [0, 30, 300], - * ``` - * - * Nulls and empty lists propagate in different ways depending on what is null or empty. - *``` - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [0, 5, 100], - * [1, null, 100], - * [2, 15, 100], - * ``` - * Note that null lists are not included in the resulting table, but nulls inside - * lists and empty lists will be represented with a null entry for that column in that row. - * - * @param input_table Table to explode. - * @param explode_column_idx Column index to explode inside the table. - * @param mr Device memory resource used to allocate the returned column's device memory. - * - * @return A new table with exploded value and position. The column order of return table is - * [cols before explode_input, explode_position, explode_value, cols after explode_input]. - */ -std::unique_ptr
explode_position( - table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 300722920f4..1f5b6241850 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,82 +146,6 @@ std::unique_ptr filter_characters_of_type( string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to integers. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7'] - * b = s.is_integer(s) - * b is [true, true, false, false, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to integers. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_integer(strings_column_view const& strings); - -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to floats. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9eE.]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] - * b = s.is_float(s) - * b is [true, true, false, false, true, true, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to floats. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9eE.]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. 
- * @return true if all string are valid - */ -bool all_float(strings_column_view const& strings); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index cb4746dbf40..d1e00b36f6f 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,30 @@ std::unique_ptr from_floats( column_view const& floats, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to floats. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9eE.]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] + * b = s.is_float(s) + * b is [true, true, false, false, true, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 8f42deb380d..1e2fa80b129 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,6 +73,30 @@ std::unique_ptr from_integers( column_view const& integers, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7'] + * b = s.is_integer(s) + * b is [true, true, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a new integer numeric column parsing hexadecimal values from the * provided strings column. 
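The three header hunks above relocate the string-to-number validity checks from char_types.hpp into the convert_* headers. As a rough illustration (not part of this patch), a caller would now include those headers directly; the helper function name below is hypothetical, while the `is_integer`/`is_float` signatures match the declarations added above.

```cpp
// Illustrative sketch only -- not part of this patch. After this change, callers
// include the convert_* headers (rather than char_types.hpp) for these checks.
#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_floats.hpp>
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>
#include <utility>

// Hypothetical helper: returns boolean columns flagging which rows of `input` are
// candidates for integer and float conversion (see the doxygen examples above).
std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>
classify_numeric_strings(cudf::strings_column_view const& input)
{
  return {cudf::strings::is_integer(input), cudf::strings::is_float(input)};
}
```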
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28da8ef4324..988fa552100 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -31,15 +31,60 @@ #include namespace cudf { +namespace strings { +namespace detail { -template -constexpr inline bool is_signed_iterator() +/** + * @brief Returns a new chars column using the specified indices to select + * strings from the input iterator. + * + * This uses a character-parallel gather CUDA kernel that performs very + * well on a strings column with long strings (e.g. average > 64 bytes). + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param map_begin Start of index iterator. + * @param map_end End of index iterator. + * @param offsets The offset values to be associated with the output chars column. + * @param chars_bytes The total number of bytes for the output chars column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return New chars column fit for a strings column. + */ +template +std::unique_ptr gather_chars(StringIterator strings_begin, + MapIterator map_begin, + MapIterator map_end, + cudf::device_span const offsets, + size_type chars_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return std::is_signed::value_type>::value; -} + auto const output_count = std::distance(map_begin, map_end); + if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); -namespace strings { -namespace detail { + auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto const d_chars = chars_column->mutable_view().template data(); + + auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { + auto const out_row = + thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); + auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index + auto const d_str = strings_begin[row_idx]; // get row's string + auto const offset = out_idx - *out_row; // get string's char + return d_str.data()[offset]; + }; + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + d_chars, + gather_chars_fn); + + return chars_column; +} /** * @brief Returns a new strings column using the specified indices to select @@ -107,29 +152,15 @@ std::unique_ptr gather( rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); // build chars column - size_type const out_chars_bytes = static_cast(total_bytes); - auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr); - auto const d_out_chars = out_chars_column->mutable_view().template data(); - - // fill in chars cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_in_chars = (strings_count > 0) ? 
strings.chars().data() : nullptr; - auto gather_chars_fn = - [d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) { - // find output row index for this output char index - auto const next_row_ptr = thrust::upper_bound( - thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx); - auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1; - auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx]; - auto const in_row_idx = begin[out_row_idx]; - auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset; - return d_in_chars[in_char_offset]; - }; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(out_chars_bytes), - d_out_chars, - gather_chars_fn); + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto out_chars_column = gather_chars(d_strings->begin(), + begin, + end, + d_out_offsets_span, + static_cast(total_bytes), + stream, + mr); return make_strings_column(output_count, std::move(out_offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e843c555c5..932f7eb0926 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include @@ -34,7 +36,27 @@ namespace cudf { namespace strings { namespace detail { -// Create a strings-type column from iterators of pointer/size pairs +/** + * @brief Average string byte-length threshold for deciding character-level + * vs. row-level parallel algorithm. + * + * This value was determined by running the factory_benchmark against different + * string lengths and observing the point where the performance is faster for + * long strings. + */ +constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; + +/** + * @brief Create a strings-type column from iterators of pointer/size pairs + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param begin First string row (inclusive) + * @param end Last string row (exclusive) + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, @@ -51,7 +73,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? 
item.second : 0; }; - size_t bytes = thrust::transform_reduce( + size_t const bytes = thrust::transform_reduce( rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < static_cast(std::numeric_limits::max()), "total size of strings is too large for cudf column"); @@ -65,26 +87,49 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); // create null mask - auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; - auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); - auto null_count = new_nulls.second; + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); + auto const null_count = new_nulls.second; auto null_mask = (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair str = thrust::get<0>(item); - size_type offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_column->view().template begin())), - strings_count, - copy_chars); + std::unique_ptr chars_column = [&] { + // use a character-parallel kernel for long string lengths + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const d_offsets = + device_span{offsets_column->view().template data(), + static_cast(offsets_column->size())}; + auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) { + return string_view{ip.first, ip.second}; + }); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + static_cast(bytes), + stream, + mr); + } else { + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + size_type const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_tuple( + begin, offsets_column->view().template begin())), + strings_count, + copy_chars); + return chars_column; + } + }(); return make_strings_column(strings_count, std::move(offsets_column), @@ -95,7 +140,22 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, mr); } -// Create a strings-type column from iterators to chars, offsets, and bitmask. +/** + * @brief Create a strings-type column from iterators to chars, offsets, and bitmask. 
+ * + * @tparam CharIterator iterator over character bytes (int8) + * @tparam OffsetIterator iterator over offset values (size_type) + * + * @param chars_begin First character byte (inclusive) + * @param chars_end Last character byte (exclusive) + * @param offset_begin First offset value (inclusive) + * @param offset_end Last offset value (exclusive) + * @param null_count Number of null rows + * @param null_mask The validity bitmask in Arrow format + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(CharIterator chars_begin, CharIterator chars_end, diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 553cf5e9096..4571362076c 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -110,6 +110,27 @@ class table { */ std::vector> release(); + /** + * @brief Returns a table_view built from a range of column indices. + * + * @throws std::out_of_range + * If any index is outside [0, num_columns()) + * + * @param begin Beginning of the range + * @param end Ending of the range + * @return A table_view consisting of columns from the original table + * specified by the elements of `column_indices` + */ + + template + table_view select(InputIterator begin, InputIterator end) const + { + std::vector columns(std::distance(begin, end)); + std::transform( + begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); + return table_view(columns); + } + /** * @brief Returns a table_view with set of specified columns. * @@ -120,7 +141,10 @@ class table { * @return A table_view consisting of columns from the original table * specified by the elements of `column_indices` */ - table_view select(std::vector const& column_indices) const; + table_view select(std::vector const& column_indices) const + { + return select(column_indices.begin(), column_indices.end()); + }; /** * @brief Returns a reference to the specified column diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 22f2073f73c..083366cc310 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -174,6 +174,25 @@ class table_view : public detail::table_view_base { */ table_view(std::vector const& views); + /** + * @brief Returns a table_view built from a range of column indices. + * + * @throws std::out_of_range + * If any index is outside [0, num_columns()) + * + * @param begin Beginning of the range + * @param end Ending of the range + * @return A table_view consisting of columns from the original table + * specified by the elements of `column_indices` + */ + template + table_view select(InputIterator begin, InputIterator end) const + { + std::vector columns(std::distance(begin, end)); + std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); + return table_view(columns); + } + /** * @brief Returns a table_view with set of specified columns. 
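// [Editor's note: illustrative sketch, not part of this patch.] The new
// iterator-range select() overload accepts any range of column indices, so a
// caller can skip materializing a std::vector of indices first. The helper name
// below is hypothetical:
#include <cudf/table/table.hpp>
#include <thrust/iterator/counting_iterator.h>

cudf::table_view first_n_columns(cudf::table const& tbl, cudf::size_type n)
{
  // Select columns [0, n) straight from a counting-iterator range.
  return tbl.select(thrust::make_counting_iterator(0), thrust::make_counting_iterator(n));
}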
* diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e045476ea77..1e0d45d081d 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type) return cudf::type_dispatcher(type, is_unsigned_impl{}); } +/** + * @brief Indicates whether the `Iterator` value type is unsigned. + * + * @tparam Iterator The type to verify + * @return true if the iterator's value type is unsigned + */ +template +constexpr inline bool is_signed_iterator() +{ + return std::is_signed::value_type>::value; +} + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index dda2e0704f6..f7e153d71f4 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -416,36 +416,28 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars().data(); - if (out_sink_->supports_device_write()) { - // host algorithm call, but the underlying call - // is a device_write taking a device buffer; - // + if (out_sink_->is_device_write_preferred(total_num_bytes)) { + // Direct write from device memory out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); - out_sink_->device_write(newline.data(), - newline.size(), - stream); // needs newline at the end, to separate from next chunk } else { - // no device write possible; - // - // copy the bytes to host, too: - // + // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), ptr_all_bytes, total_num_bytes * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); - // host algorithm call, where the underlying call - // is also host_write taking a host buffer; - // - char const* ptr_h_bytes = h_bytes.data(); - out_sink_->host_write(ptr_h_bytes, total_num_bytes); + out_sink_->host_write(h_bytes.data(), total_num_bytes); + } + + // Needs newline at the end, to separate from next chunk + if (out_sink_->is_device_write_preferred(newline.size())) { + out_sink_->device_write(newline.data(), newline.size(), stream); + } else { out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator() - .size()); // needs newline at the end, to separate from next chunk + options_.get_line_terminator().size()); } } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 80935e3fbd5..61adef26dab 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -139,7 +139,7 @@ struct orc_stream_info { } uint64_t offset; // offset in file size_t dst_pos; // offset in memory relative to start of compressed stripe data - uint32_t length; // length in file + size_t length; // length in file uint32_t gdf_idx; // column index uint32_t stripe_idx; // stripe index }; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 4bca725a16b..1ff752034ad 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1455,8 +1455,9 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); 
// Decode data streams { - uint32_t numvals = s->top.data.max_vals, secondary_val; - uint32_t vals_skipped = 0; + uint32_t numvals = s->top.data.max_vals; + uint64_t secondary_val = 0; + uint32_t vals_skipped = 0; if (s->is_string || s->chunk.type_kind == TIMESTAMP) { // For these data types, we have a secondary unsigned 32-bit data stream orc_bytestream_s *bs = (is_dictionary(s->chunk.encoding_kind)) ? &s->bs : &s->bs2; @@ -1471,9 +1472,15 @@ __global__ void __launch_bounds__(block_size) } if (numvals > ofs) { if (is_rlev1(s->chunk.encoding_kind)) { - numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t); + if (s->chunk.type_kind == TIMESTAMP) + numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u64[ofs], numvals - ofs, t); + else + numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t); } else { - numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t); + if (s->chunk.type_kind == TIMESTAMP) + numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u64[ofs], numvals - ofs, t); + else + numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t); } __syncthreads(); if (numvals <= ofs && t >= ofs && t < s->top.data.max_vals) { s->vals.u32[t] = 0; } @@ -1487,15 +1494,24 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); if (t == 0) { s->top.data.index.run_pos[cid] = 0; } numvals -= vals_skipped; - if (t < numvals) { secondary_val = s->vals.u32[vals_skipped + t]; } + if (t < numvals) { + secondary_val = (s->chunk.type_kind == TIMESTAMP) ? s->vals.u64[vals_skipped + t] + : s->vals.u32[vals_skipped + t]; + } __syncthreads(); - if (t < numvals) { s->vals.u32[t] = secondary_val; } + if (t < numvals) { + if (s->chunk.type_kind == TIMESTAMP) + s->vals.u64[t] = secondary_val; + else + s->vals.u32[t] = secondary_val; + } } } __syncthreads(); // For strings with direct encoding, we need to convert the lengths into an offset if (!is_dictionary(s->chunk.encoding_kind)) { - secondary_val = (t < numvals) ? s->vals.u32[t] : 0; + if (t < numvals) + secondary_val = (s->chunk.type_kind == TIMESTAMP) ? 
s->vals.u64[t] : s->vals.u32[t]; if (s->chunk.type_kind != TIMESTAMP) { lengths_to_positions(s->vals.u32, numvals, t); __syncthreads(); @@ -1693,7 +1709,7 @@ __global__ void __launch_bounds__(block_size) } case TIMESTAMP: { int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - uint32_t nanos = secondary_val; + uint64_t nanos = secondary_val; nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; if (!tz_table.ttimes.empty()) { seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); @@ -1716,7 +1732,7 @@ __global__ void __launch_bounds__(block_size) if (s->chunk.type_kind == TIMESTAMP) { int buffer_pos = s->top.data.max_vals; if (t >= buffer_pos && t < buffer_pos + s->top.data.buffered_count) { - s->vals.u32[t - buffer_pos] = secondary_val; + s->vals.u64[t - buffer_pos] = secondary_val; } } else if (s->chunk.type_kind == BOOLEAN && t < s->top.data.buffered_count) { s->vals.u8[t] = secondary_val; diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 88cad005817..aef32efaf6e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -92,6 +92,7 @@ struct orcenc_state_s { union { uint8_t u8[2048]; uint32_t u32[1024]; + uint64_t u64[1024]; } lengths; }; @@ -101,6 +102,7 @@ static inline __device__ uint32_t zigzag(int32_t v) int32_t s = (v >> 31); return ((v ^ s) * 2) - s; } +static inline __device__ uint64_t zigzag(uint64_t v) { return v; } static inline __device__ uint64_t zigzag(int64_t v) { int64_t s = (v < 0) ? 1 : 0; @@ -286,24 +288,6 @@ static inline __device__ uint32_t StoreVarint(uint8_t *dst, uint64_t v) return bytecnt; } -static inline __device__ void intrle_minmax(int64_t &vmin, int64_t &vmax) -{ - vmin = INT64_MIN; - vmax = INT64_MAX; -} -// static inline __device__ void intrle_minmax(uint64_t &vmin, uint64_t &vmax) { vmin = UINT64_C(0); -// vmax = UINT64_MAX; } -static inline __device__ void intrle_minmax(int32_t &vmin, int32_t &vmax) -{ - vmin = INT32_MIN; - vmax = INT32_MAX; -} -static inline __device__ void intrle_minmax(uint32_t &vmin, uint32_t &vmax) -{ - vmin = UINT32_C(0); - vmax = UINT32_MAX; -} - template static inline __device__ void StoreBytesBigEndian(uint8_t *dst, T v, uint32_t w) { @@ -412,13 +396,9 @@ static __device__ uint32_t IntegerRLE(orcenc_state_s *s, // Find minimum and maximum values if (literal_run > 0) { // Find min & max - T vmin, vmax; + T vmin = (t < literal_run) ? v0 : std::numeric_limits::max(); + T vmax = (t < literal_run) ? 
v0 : std::numeric_limits::min(); uint32_t literal_mode, literal_w; - if (t < literal_run) { - vmin = vmax = v0; - } else { - intrle_minmax(vmax, vmin); - } vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); __syncthreads(); vmax = block_reduce(temp_storage).Reduce(vmax, cub::Max()); @@ -652,6 +632,7 @@ __global__ void __launch_bounds__(block_size) typename cub::BlockReduce::TempStorage i32; typename cub::BlockReduce::TempStorage i64; typename cub::BlockReduce::TempStorage u32; + typename cub::BlockReduce::TempStorage u64; } temp_storage; orcenc_state_s *const s = &state_g; @@ -763,7 +744,7 @@ __global__ void __launch_bounds__(block_size) int64_t ts = static_cast(base)[row]; int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; - int32_t nanos = (ts - seconds * ts_scale); + int64_t nanos = (ts - seconds * ts_scale); // There is a bug in the ORC spec such that for negative timestamps, it is understood // between the writer and reader that nanos will be adjusted to their positive component // but the negative seconds will be left alone. This means that -2.6 is encoded as @@ -786,7 +767,7 @@ __global__ void __launch_bounds__(block_size) } nanos = (nanos << 3) + zeroes; } - s->lengths.u32[nz_idx] = nanos; + s->lengths.u64[nz_idx] = nanos; break; } case STRING: @@ -897,6 +878,9 @@ __global__ void __launch_bounds__(block_size) uint32_t flush = (s->cur_row == s->chunk.num_rows) ? 1 : 0, n; switch (s->chunk.type_kind) { case TIMESTAMP: + n = IntegerRLE( + s, s->lengths.u64, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u64); + break; case STRING: n = IntegerRLE( s, s->lengths.u32, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u32); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1c99c99369b..eb5e90bbeec 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -476,7 +476,6 @@ orc_streams writer::impl::create_streams(host_span columns, break; case TypeKind::STRING: { bool enable_dict = enable_dictionary_; - size_t direct_data_size = 0; size_t dict_data_size = 0; size_t dict_strings = 0; size_t dict_lengths_div512 = 0; @@ -488,11 +487,15 @@ orc_streams writer::impl::create_streams(host_span columns, dict_lengths_div512 += (sd->num_strings + 0x1ff) >> 9; dict_data_size += sd->dict_char_count; } - direct_data_size += std::accumulate( - stripe.cbegin(), stripe.cend(), direct_data_size, [&](auto data_size, auto rg_idx) { - return data_size + column.host_dict_chunk(rg_idx)->string_char_count; - }); } + + auto const direct_data_size = + std::accumulate(stripe_bounds.front().cbegin(), + stripe_bounds.back().cend(), + size_t{0}, + [&](auto data_size, auto rg_idx) { + return data_size + column.host_dict_chunk(rg_idx)->string_char_count; + }); if (enable_dict) { uint32_t dict_bits = 0; for (dict_bits = 1; dict_bits < 32; dict_bits <<= 1) { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 43d144ec980..f920aee1c29 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
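// [Editor's note: illustrative sketch, not part of this patch.] Dropping the
// intrle_minmax() helpers works because threads that hold no value can seed the
// block reduction with the identity element from std::numeric_limits. A
// host-side analogue of the same idea, for integer types:
#include <algorithm>
#include <limits>
#include <utility>
#include <vector>

template <typename T>
std::pair<T, T> min_max_with_identity(std::vector<T> const& vals)
{
  T vmin = std::numeric_limits<T>::max();  // identity for the min-reduction
  T vmax = std::numeric_limits<T>::min();  // identity for the max-reduction (integers only)
  for (auto const v : vals) {
    vmin = std::min(vmin, v);
    vmax = std::max(vmax, v);
  }
  return {vmin, vmax};
}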
@@ -184,7 +184,7 @@ struct ColumnChunkDesc { { } - uint8_t *compressed_data; // pointer to compressed column chunk data + uint8_t const *compressed_data; // pointer to compressed column chunk data size_t compressed_size; // total compressed data size for this chunk size_t num_values; // total number of values in this column size_t start_row; // starting row of this chunk diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a7a02cc6108..16cf0877c23 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -822,7 +822,7 @@ void generate_depth_remappings(std::map, std::ve * @copydoc cudf::io::detail::parquet::read_column_chunks */ void reader::impl::read_column_chunks( - std::vector &page_data, + std::vector> &page_data, hostdevice_vector &chunks, // TODO const? size_t begin_chunk, size_t end_chunk, @@ -850,9 +850,15 @@ void reader::impl::read_column_chunks( next_chunk++; } if (io_size != 0) { - auto buffer = _sources[chunk_source_map[chunk]]->host_read(io_offset, io_size); - page_data[chunk] = rmm::device_buffer(buffer->data(), buffer->size(), stream); - uint8_t *d_compdata = static_cast(page_data[chunk].data()); + auto &source = _sources[chunk_source_map[chunk]]; + if (source->is_device_read_preferred(io_size)) { + page_data[chunk] = source->device_read(io_offset, io_size, stream); + } else { + auto const buffer = source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } + auto d_compdata = page_data[chunk]->data(); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -1414,7 +1420,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - std::vector page_data(num_chunks); + std::vector> page_data(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1516,10 +1522,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, decomp_page_data = decompress_page_data(chunks, pages, stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED && page_data[c].size() != 0) { - page_data[c].resize(0); - page_data[c].shrink_to_fit(); - } + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 137fca03bfd..ca200936134 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,7 +91,7 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches. 
* */ - void read_column_chunks(std::vector &page_data, + void read_column_chunks(std::vector> &page_data, hostdevice_vector &chunks, size_t begin_chunk, size_t end_chunk, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a645ca0fd91..dd68bc50043 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1108,19 +1108,7 @@ void writer::impl::write(table_view const &table) num_stats_bfr); } - auto host_bfr = [&]() { - // if the writer supports device_write(), we don't need this scratch space - if (out_sink_->supports_device_write()) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_chunk_bfr_size), - cudaFreeHost}; - } - }(); + pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); @@ -1155,7 +1143,7 @@ void writer::impl::write(table_view const &table) dev_bfr = ck->uncompressed_bfr; } - if (out_sink_->supports_device_write()) { + if (out_sink_->is_device_write_preferred(ck->compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream); // we still need to do a (much smaller) memcpy for the statistics. @@ -1170,6 +1158,14 @@ void writer::impl::write(table_view const &table) stream.synchronize(); } } else { + if (!host_bfr) { + host_bfr = pinned_buffer{[](size_t size) { + uint8_t *ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_chunk_bfr_size), + cudaFreeHost}; + } // copy the full data CUDA_TRY(cudaMemcpyAsync(host_bfr.get(), dev_bfr, diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 48558005303..10af7bcb0bd 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
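// [Editor's note: illustrative sketch, not part of this patch.] The Parquet
// writer now creates its pinned bounce buffer lazily, so sinks that always
// prefer device writes never pay for cudaMallocHost. A simplified analogue of
// that pattern (ensure_pinned and pinned_ptr are hypothetical names):
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <memory>

using pinned_ptr = std::unique_ptr<uint8_t, decltype(&cudaFreeHost)>;

void ensure_pinned(pinned_ptr& buf, std::size_t size)
{
  if (!buf) {  // allocate only when the host fallback path is actually taken
    uint8_t* ptr = nullptr;
    if (cudaMallocHost(&ptr, size) != cudaSuccess) { ptr = nullptr; }
    buf = pinned_ptr{ptr, cudaFreeHost};
  }
}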
@@ -18,6 +18,7 @@ #include #include +#include #include @@ -29,24 +30,44 @@ namespace io { class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) + : _cufile_out(detail::make_cufile_output(filepath)) { - outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); - CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); + _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); + CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); } virtual ~file_sink() { flush(); } void host_write(void const* data, size_t size) override { - outfile_.write(static_cast(data), size); + _output_stream.seekp(_bytes_written); + _output_stream.write(static_cast(data), size); + _bytes_written += size; } - void flush() override { outfile_.flush(); } + void flush() override { _output_stream.flush(); } - size_t bytes_written() override { return outfile_.tellp(); } + size_t bytes_written() override { return _bytes_written; } + + bool supports_device_write() const override { return _cufile_out != nullptr; } + + bool is_device_write_preferred(size_t size) const override + { + return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); + } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + + _cufile_out->write(gpu_data, _bytes_written, size); + _bytes_written += size; + } private: - std::ofstream outfile_; + std::ofstream _output_stream; + size_t _bytes_written = 0; + std::unique_ptr _cufile_out; }; /** @@ -77,25 +98,25 @@ class host_buffer_sink : public data_sink { */ class void_sink : public data_sink { public: - explicit void_sink() : bytes_written_(0) {} + explicit void_sink() : _bytes_written(0) {} virtual ~void_sink() {} - void host_write(void const* data, size_t size) override { bytes_written_ += size; } + void host_write(void const* data, size_t size) override { _bytes_written += size; } bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { - bytes_written_ += size; + _bytes_written += size; } void flush() override {} - size_t bytes_written() override { return bytes_written_; } + size_t bytes_written() override { return _bytes_written; } private: - size_t bytes_written_; + size_t _bytes_written; }; class user_sink_wrapper : public data_sink { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 74163d023be..3f2884d5b7d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,14 @@ * limitations under the License. */ +#include + #include #include -#include -#include #include -#include #include +#include namespace cudf { namespace io { @@ -34,12 +34,6 @@ namespace io { * mapping a subset of the file where the starting offset may not be zero. 
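// [Editor's note: illustrative sketch, not part of this patch.] Callers can use
// the new data_sink::is_device_write_preferred() query to pick between a direct
// cuFile write and staging through host memory; write_device_buffer below is a
// hypothetical helper with error handling omitted:
#include <cudf/io/data_sink.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

void write_device_buffer(cudf::io::data_sink* sink,
                         void const* d_data,
                         std::size_t size,
                         rmm::cuda_stream_view stream)
{
  if (sink->is_device_write_preferred(size)) {
    sink->device_write(d_data, size, stream);  // direct GPU-to-file path
  } else {
    std::vector<char> h_data(size);  // stage through host memory
    cudaMemcpyAsync(h_data.data(), d_data, size, cudaMemcpyDeviceToHost, stream.value());
    stream.synchronize();
    sink->host_write(h_data.data(), size);
  }
}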
*/ class memory_mapped_source : public datasource { - struct file_wrapper { - const int fd = -1; - explicit file_wrapper(const char *filepath) : fd(open(filepath, O_RDONLY)) {} - ~file_wrapper() { close(fd); } - }; - class memory_mapped_buffer : public buffer { size_t _size = 0; uint8_t *_data = nullptr; @@ -52,77 +46,99 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) + : _cufile_in(detail::make_cufile_input(filepath)) { - auto const file = file_wrapper(filepath); - CUDF_EXPECTS(file.fd != -1, "Cannot open file"); - - struct stat st; - CUDF_EXPECTS(fstat(file.fd, &st) != -1, "Cannot query file size"); - file_size_ = static_cast(st.st_size); - - if (file_size_ != 0) { map(file.fd, offset, size); } + auto const file = detail::file_wrapper(filepath, O_RDONLY); + _file_size = file.size(); + if (_file_size != 0) { map(file.desc(), offset, size); } } virtual ~memory_mapped_source() { - if (map_addr_ != nullptr) { munmap(map_addr_, map_size_); } + if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); return std::make_unique( - static_cast(map_addr_) + (offset - map_offset_), read_size); + static_cast(_map_addr) + (offset - _map_offset), read_size); } size_t host_read(size_t offset, size_t size, uint8_t *dst) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - auto const src = static_cast(map_addr_) + (offset - map_offset_); + auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); return read_size; } - size_t size() const override { return file_size_; } + bool supports_device_read() const override { return _cufile_in != nullptr; } + + bool is_device_read_preferred(size_t size) const + { + return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, stream); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, dst, stream); + } + + size_t size() const override { return _file_size; } private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < file_size_, "Offset is past end of file"); + CUDF_EXPECTS(offset < _file_size, "Offset is past end of file"); // Offset for `mmap()` must be page 
aligned - auto const map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); + _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); // Clamp length to available data in the file if (size == 0) { - size = file_size_ - offset; + size = _file_size - offset; } else { - if ((offset + size) > file_size_) { size = file_size_ - offset; } + if ((offset + size) > _file_size) { size = _file_size - offset; } } // Size for `mmap()` needs to include the page padding - const auto map_size = size + (offset - map_offset); + _map_size = size + (offset - _map_offset); // Check if accessing a region within already mapped area - map_addr_ = mmap(NULL, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset); - CUDF_EXPECTS(map_addr_ != MAP_FAILED, "Cannot create memory mapping"); - map_offset_ = map_offset; - map_size_ = map_size; + _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); + CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } private: - size_t file_size_ = 0; - void *map_addr_ = nullptr; - size_t map_size_ = 0; - size_t map_offset_ = 0; + size_t _file_size = 0; + void *_map_addr = nullptr; + size_t _map_size = 0; + size_t _map_offset = 0; + std::unique_ptr _cufile_in; }; /** @@ -148,14 +164,19 @@ class user_datasource_wrapper : public datasource { bool supports_device_read() const override { return source->supports_device_read(); } - size_t device_read(size_t offset, size_t size, uint8_t *dst) override + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size, dst); + return source->device_read(offset, size, dst, stream); } - std::unique_ptr device_read(size_t offset, size_t size) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size); + return source->device_read(offset, size, stream); } size_t size() const override { return source->size(); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp new file mode 100644 index 00000000000..22ff057cbc1 --- /dev/null +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace detail { + +file_wrapper::file_wrapper(std::string const &filepath, int flags) + : fd(open(filepath.c_str(), flags)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) + : fd(open(filepath.c_str(), flags, mode)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::~file_wrapper() { close(fd); } + +long file_wrapper::size() const +{ + if (_size < 0) { + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + _size = static_cast(st.st_size); + } + return _size; +} + +#ifdef CUFILE_FOUND + +/** + * @brief Class that manages cuFile configuration. + */ +class cufile_config { + std::string const default_policy = "OFF"; + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + + std::string const policy = default_policy; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + std::string getenv_or(std::string const &env_var_name, std::string const &default_val) + { + auto const env_val = std::getenv(env_var_name.c_str()); + return (env_val == nullptr) ? default_val : std::string(env_val); + } + + cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} + { + if (is_enabled()) { + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value + // Enable compatiblity mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } + } + } + + public: + /** + * @brief Returns true when cuFile use is enabled. + */ + bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } + + /** + * @brief Returns true when cuDF should not fall back to host IO. + */ + bool is_required() const { return policy == "ALWAYS"; } + + static cufile_config const *instance() + { + static cufile_config _instance; + return &_instance; + } +}; + +/** + * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. 
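// [Editor's note: illustrative sketch, not part of this patch.] Per the
// cufile_config code above, cuFile use is controlled by the LIBCUDF_CUFILE_POLICY
// environment variable: "OFF" (default) disables it, "GDS" enables it with a
// fallback to host IO, and "ALWAYS" requires it and turns on cuFile's
// compatibility mode instead of falling back. A test harness could opt in before
// the first IO call, for example:
#include <cstdlib>

void enable_gds()
{
  // setenv is POSIX; cuFile itself is Linux-only, so that is the relevant platform
  setenv("LIBCUDF_CUFILE_POLICY", "GDS", /*overwrite=*/1);
}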
+ */ +class cufile_shim { + private: + cufile_shim(); + + void *cf_lib = nullptr; + decltype(cuFileDriverOpen) *driver_open = nullptr; + decltype(cuFileDriverClose) *driver_close = nullptr; + + std::unique_ptr init_error; + auto is_valid() const noexcept { return init_error == nullptr; } + + public: + cufile_shim(cufile_shim const &) = delete; + cufile_shim &operator=(cufile_shim const &) = delete; + + static cufile_shim const *instance(); + + ~cufile_shim() + { + driver_close(); + dlclose(cf_lib); + } + + decltype(cuFileHandleRegister) *handle_register = nullptr; + decltype(cuFileHandleDeregister) *handle_deregister = nullptr; + decltype(cuFileRead) *read = nullptr; + decltype(cuFileWrite) *write = nullptr; +}; + +cufile_shim::cufile_shim() +{ + try { + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, + "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); + + CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); + } catch (cudf::logic_error const &err) { + init_error = std::make_unique(err); + } +} + +cufile_shim const *cufile_shim::instance() +{ + static cufile_shim _instance; + // Defer throwing to avoid repeated attempts to load the library + if (!_instance.is_valid()) CUDF_FAIL("" + std::string(_instance.init_error->what())); + + return &_instance; +} + +void cufile_registered_file::register_handle() +{ + CUfileDescr_t cufile_desc{}; + cufile_desc.handle.fd = _file.desc(); + cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUDF_EXPECTS(shim->handle_register(&cf_handle, &cufile_desc).err == CU_FILE_SUCCESS, + "Cannot register file handle with cuFile"); +} + +cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } + +cufile_input_impl::cufile_input_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) +{ +} + +std::unique_ptr cufile_input_impl::read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) +{ + rmm::device_buffer out_data(size, stream); + CUDF_EXPECTS(shim->read(cf_file.handle(), out_data.data(), size, offset, 0) != -1, + "cuFile error reading from a file"); + + return datasource::buffer::create(std::move(out_data)); +} + +size_t cufile_input_impl::read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, + "cuFile error reading from a file"); + // always read the requested size for now + return size; +} + +cufile_output_impl::cufile_output_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 
0664) +{ +} + +void cufile_output_impl::write(void const *data, size_t offset, size_t size) +{ + CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1, + "cuFile error writing to a file"); +} +#endif + +std::unique_ptr make_cufile_input(std::string const &filepath) +{ +#ifdef CUFILE_FOUND + if (cufile_config::instance()->is_enabled()) { + try { + return std::make_unique(filepath); + } catch (...) { + if (cufile_config::instance()->is_required()) throw; + } + } +#endif + return nullptr; +} + +std::unique_ptr make_cufile_output(std::string const &filepath) +{ +#ifdef CUFILE_FOUND + if (cufile_config::instance()->is_enabled()) { + try { + return std::make_unique(filepath); + } catch (...) { + if (cufile_config::instance()->is_required()) throw; + } + } +#endif + return nullptr; +} + +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp new file mode 100644 index 00000000000..85399bdd44d --- /dev/null +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef CUFILE_FOUND +#include +#endif + +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace detail { + +/** + * @brief Class that provides RAII for file handling. + */ +class file_wrapper { + int const fd = -1; + long mutable _size = -1; + + public: + explicit file_wrapper(std::string const &filepath, int flags); + explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); + ~file_wrapper(); + long size() const; + auto desc() const { return fd; } +}; + +/** + * @brief Base class for cuFile input/output. + * + * Contains the common API for cuFile input and output classes. + */ +class cufile_io_base { + public: + /** + * @brief Returns an estimate of whether the cuFile operation is the optimal option. + * + * @param size Read/write operation size, in bytes. + * @return Whether a cuFile operation with the given size is expected to be faster than a host + * read + H2D copy + */ + static bool is_cufile_io_preferred(size_t size) { return size > op_size_threshold; } + + protected: + /** + * @brief The read/write size above which cuFile is faster then host read + copy + * + * This may not be the optimal threshold for all systems. Derived `is_cufile_io_preferred` + * implementations can use a different logic. + */ + static constexpr size_t op_size_threshold = 128 << 10; +}; + +/** + * @brief Interface class for cufile input. + */ +class cufile_input : public cufile_io_base { + public: + /** + * @brief Reads into a new device buffer. 
+ * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use + * + * @return The data buffer in the device memory + */ + virtual std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) = 0; + + /** + * @brief Reads into existing device memory. + * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use + * + * @return The number of bytes read + */ + virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; +}; + +/** + * @brief Interface class for cufile output. + */ +class cufile_output : public cufile_io_base { + public: + /** + * @brief Writes the data from a device buffer into a file. + * + * @throws cudf::logic_error on cuFile error + * + * @param data Pointer to the buffer to be written into the output file + * @param offset Number of bytes from the start + * @param size Number of bytes to write + */ + virtual void write(void const *data, size_t offset, size_t size) = 0; +}; + +#ifdef CUFILE_FOUND + +class cufile_shim; + +/** + * @brief Class that provides RAII for cuFile file registration. + */ +struct cufile_registered_file { + void register_handle(); + + public: + cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags) + : _file(filepath, flags), shim{shim} + { + register_handle(); + } + + cufile_registered_file(cufile_shim const *shim, + std::string const &filepath, + int flags, + mode_t mode) + : _file(filepath, flags, mode), shim{shim} + { + register_handle(); + } + + auto const &handle() const noexcept { return cf_handle; } + + ~cufile_registered_file(); + + private: + file_wrapper const _file; + CUfileHandle_t cf_handle = nullptr; + cufile_shim const *shim = nullptr; +}; + +/** + * @brief Adapter for the `cuFileRead` API. + * + * Exposes APIs to read directly from a file into device memory. + */ +class cufile_input_impl final : public cufile_input { + public: + cufile_input_impl(std::string const &filepath); + + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override; + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; + +/** + * @brief Adapter for the `cuFileWrite` API. + * + * Exposes an API to write directly into a file from device memory. 
+ */ +class cufile_output_impl final : public cufile_output { + public: + cufile_output_impl(std::string const &filepath); + + void write(void const *data, size_t offset, size_t size) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; +#else + +class cufile_input_impl final : public cufile_input { + public: + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; + +class cufile_output_impl final : public cufile_output { + public: + void write(void const *data, size_t offset, size_t size) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; +#endif + +/** + * @brief Creates a `cufile_input_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_input_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_input(std::string const &filepath); + +/** + * @brief Creates a `cufile_output_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_output_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_output(std::string const &filepath); + +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu new file mode 100644 index 00000000000..336aabde15e --- /dev/null +++ b/cpp/src/lists/explode.cu @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace { + +std::unique_ptr
build_table( + table_view const& input_table, + size_type const explode_column_idx, + column_view const& sliced_child, + cudf::device_span gather_map, + thrust::optional> explode_col_gather_map, + thrust::optional> position_array, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto select_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; }); + + auto gathered_table = + detail::gather(input_table.select(select_iter, select_iter + input_table.num_columns() - 1), + gather_map.begin(), + gather_map.end(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + std::vector> columns = gathered_table.release()->release(); + + columns.insert(columns.begin() + explode_column_idx, + explode_col_gather_map + ? std::move(detail::gather(table_view({sliced_child}), + explode_col_gather_map->begin(), + explode_col_gather_map->end(), + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release()[0]) + : std::make_unique(sliced_child, stream, mr)); + + if (position_array) { + size_type position_size = position_array->size(); + columns.insert(columns.begin() + explode_column_idx, + std::make_unique( + data_type(type_to_id()), position_size, position_array->release())); + } + + return std::make_unique
(std::move(columns));
+}
+}  // namespace
+
+std::unique_ptr<table>
explode(table_view const& input_table, + size_type const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + rmm::device_uvector gather_map(sliced_child.size(), stream); + + // Sliced columns may require rebasing of the offsets. + auto offsets = explode_col.offsets_begin(); + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each + // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by + // skipping the first entry and using the result of `lower_bound` directly. + thrust::lower_bound(rmm::exec_policy(stream), + offsets_minus_one, + offsets_minus_one + explode_col.size(), + counting_iter, + counting_iter + gather_map.size(), + gather_map.begin()); + + return build_table(input_table, + explode_column_idx, + sliced_child, + gather_map, + thrust::nullopt, + thrust::nullopt, + stream, + mr); +} + +std::unique_ptr
explode_position(table_view const& input_table, + size_type const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + rmm::device_uvector gather_map(sliced_child.size(), stream); + + // Sliced columns may require rebasing of the offsets. + auto offsets = explode_col.offsets_begin(); + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + rmm::device_uvector pos(sliced_child.size(), stream, mr); + + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each + // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by + // skipping the first entry and using the result of `lower_bound` directly. + thrust::transform( + rmm::exec_policy(stream), + counting_iter, + counting_iter + gather_map.size(), + gather_map.begin(), + [position_array = pos.data(), + offsets_minus_one, + offsets, + offset_size = explode_col.size()] __device__(auto idx) -> size_type { + auto lb_idx = thrust::distance( + offsets_minus_one, + thrust::lower_bound(thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx)); + position_array[idx] = idx - (offsets[lb_idx] - offsets[0]); + return lb_idx; + }); + + return build_table(input_table, + explode_column_idx, + sliced_child, + gather_map, + thrust::nullopt, + std::move(pos), + stream, + mr); +} + +std::unique_ptr
explode_outer(table_view const& input_table, + size_type const explode_column_idx, + bool include_position, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + auto counting_iter = thrust::make_counting_iterator(0); + auto offsets = explode_col.offsets_begin(); + + // number of nulls or empty lists found so far in the explode column + rmm::device_uvector null_or_empty_offset(explode_col.size(), stream); + + auto null_or_empty = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [offsets, offsets_size = explode_col.size() - 1] __device__(int idx) { + return (idx > offsets_size || (offsets[idx + 1] != offsets[idx])) ? 0 : 1; + }); + thrust::inclusive_scan(rmm::exec_policy(stream), + null_or_empty, + null_or_empty + sliced_child.size(), + null_or_empty_offset.begin()); + + auto null_or_empty_count = + null_or_empty_offset.size() > 0 ? null_or_empty_offset.back_element(stream) : 0; + if (null_or_empty_count == 0) { + // performance penalty to run the below loop if there are no nulls or empty lists. + // run simple explode instead + return include_position ? explode_position(input_table, explode_column_idx, stream, mr) + : explode(input_table, explode_column_idx, stream, mr); + } + + auto gather_map_size = sliced_child.size() + null_or_empty_count; + + rmm::device_uvector gather_map(gather_map_size, stream); + rmm::device_uvector explode_col_gather_map(gather_map_size, stream); + rmm::device_uvector pos(include_position ? gather_map_size : 0, stream, mr); + + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + // Fill in gather map with all the child column's entries + thrust::for_each(rmm::exec_policy(stream), + counting_iter, + counting_iter + sliced_child.size(), + [offsets_minus_one, + gather_map = gather_map.begin(), + explode_col_gather_map = explode_col_gather_map.begin(), + position_array = pos.begin(), + include_position, + offsets, + null_or_empty_offset = null_or_empty_offset.begin(), + null_or_empty, + offset_size = explode_col.offsets().size() - 1] __device__(auto idx) { + auto lb_idx = thrust::distance( + offsets_minus_one, + thrust::lower_bound( + thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx)); + auto index_to_write = null_or_empty_offset[lb_idx] + idx; + gather_map[index_to_write] = lb_idx; + explode_col_gather_map[index_to_write] = idx; + if (include_position) { + position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]); + } + if (null_or_empty[idx]) { + auto invalid_index = null_or_empty_offset[idx] == 0 + ? offsets[idx] + : offsets[idx] + null_or_empty_offset[idx] - 1; + gather_map[invalid_index] = idx; + + // negative one to indicate a null value + explode_col_gather_map[invalid_index] = -1; + + if (include_position) { position_array[invalid_index] = 0; } + } + }); + + return build_table( + input_table, + explode_column_idx, + sliced_child, + gather_map, + explode_col_gather_map, + include_position ? std::move(pos) : thrust::optional>{}, + stream, + mr); +} + +} // namespace detail + +/** + * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) + */ +std::unique_ptr
explode(table_view const& input_table,
+                               size_type explode_column_idx,
+                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
+               "Unsupported non-list column");
+  return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
+}
+
+/**
+ * @copydoc cudf::explode_position(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ */
+std::unique_ptr<table> explode_position(table_view const& input_table,
+                                        size_type explode_column_idx,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
+               "Unsupported non-list column");
+  return detail::explode_position(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
+}
+
+/**
+ * @copydoc cudf::explode_outer(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ */
+std::unique_ptr<table> explode_outer(table_view const& input_table,
+                                     size_type explode_column_idx,
+                                     rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
+               "Unsupported non-list column");
+  return detail::explode_outer(
+    input_table, explode_column_idx, false, rmm::cuda_stream_default, mr);
+}
+
+/**
+ * @copydoc
+ * cudf::explode_outer_position(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ */
+std::unique_ptr<table>
explode_outer_position(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode_outer(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu deleted file mode 100644 index 34d7d8fe31d..00000000000 --- a/cpp/src/reshape/explode.cu +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -namespace cudf { -namespace detail { -namespace { -/** - * @brief Function object for exploding a column. - */ -struct explode_functor { - /** - * @brief Function object for exploding a column. - */ - template - std::unique_ptr
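// [Editor's note: illustrative sketch, not part of this patch.] End-to-end
// behaviour of the relocated explode APIs, using the column wrappers from cudf's
// test utilities: explode() drops empty and null list rows, while
// explode_outer_position() keeps them and adds the intra-list position column.
#include <cudf/lists/explode.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cstdint>

void explode_example()
{
  using LCW = cudf::test::lists_column_wrapper<int32_t>;
  cudf::test::fixed_width_column_wrapper<int32_t> ids{100, 200, 300};
  LCW lists{LCW{1, 2}, LCW{}, LCW{3}};
  cudf::table_view input{{ids, lists}};

  // Explode column 1: rows become (100, 1), (100, 2), (300, 3).
  auto exploded = cudf::explode(input, 1);

  // Outer explode with position: (100, 0, 1), (100, 1, 2), (200, 0, null), (300, 0, 3).
  auto outer = cudf::explode_outer_position(input, 1);
}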
operator()(table_view const& input_table, - size_type const explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - CUDF_FAIL("Unsupported non-list column"); - - return std::make_unique
(); - } -}; - -template <> -std::unique_ptr
explode_functor::operator()( - table_view const& input_table, - size_type const explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - lists_column_view lc{input_table.column(explode_column_idx)}; - auto sliced_child = lc.get_sliced_child(stream); - rmm::device_uvector gather_map_indices(sliced_child.size(), stream); - - // Sliced columns may require rebasing of the offsets. - auto offsets = lc.offsets_begin(); - // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. - auto offsets_minus_one = thrust::make_transform_iterator( - offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); - auto counting_iter = thrust::make_counting_iterator(0); - - rmm::device_uvector pos(include_pos ? sliced_child.size() : 0, stream, mr); - - // This looks like an off-by-one bug, but what is going on here is that we need to reduce each - // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by - // skipping the first entry and using the result of `lower_bound` directly. - if (include_pos) { - thrust::transform( - rmm::exec_policy(stream), - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin(), - [position_array = pos.data(), offsets_minus_one, offsets, offset_size = lc.size()] __device__( - auto idx) -> size_type { - auto lb_idx = thrust::lower_bound( - thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx) - - offsets_minus_one; - position_array[idx] = idx - (offsets[lb_idx] - offsets[0]); - return lb_idx; - }); - } else { - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + lc.size(), - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin()); - } - - auto select_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; }); - std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); - - auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), - gather_map_indices.begin(), - gather_map_indices.end(), - cudf::out_of_bounds_policy::DONT_CHECK, - stream, - mr); - - std::vector> columns = gathered_table.release()->release(); - - columns.insert(columns.begin() + explode_column_idx, - std::make_unique(sliced_child, stream, mr)); - - if (include_pos) { - columns.insert(columns.begin() + explode_column_idx, - std::make_unique( - data_type(type_to_id()), sliced_child.size(), pos.release())); - } - - return std::make_unique
(std::move(columns)); -} -} // namespace - -/** - * @copydoc - * cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr
explode(table_view const& input_table, - size_type explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return type_dispatcher(input_table.column(explode_column_idx).type(), - explode_functor{}, - input_table, - explode_column_idx, - include_pos, - stream, - mr); -} - -} // namespace detail - -/** - * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) - */ -std::unique_ptr
explode(table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::explode(input_table, explode_column_idx, false, rmm::cuda_stream_default, mr); -} - -/** - * @copydoc cudf::explode_position(input_table,explode_column_idx,rmm::mr::device_memory_resource) - */ -std::unique_ptr<table>
explode_position(table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::explode(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr); -} - -} // namespace cudf diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 135df6bdfe2..c1ebc9f3f9f 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -384,7 +384,7 @@ get_null_bounds_for_timestamp_column(column_view const& timestamp_column, if (timestamp_column.has_nulls()) { auto p_timestamps_device_view = column_device_view::create(timestamp_column); - auto num_groups = group_offsets.size(); + auto num_groups = group_offsets.size() - 1; // Null timestamps exist. Find null bounds, per group. thrust::for_each( diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 10496b89328..0b384ad0631 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -186,91 +186,6 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str mr); } -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream), - 
thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - } // namespace detail // external API @@ -295,31 +210,5 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - -std::unique_ptr is_float(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_float(strings, rmm::cuda_stream_default, mr); -} - -bool all_integer(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_integer(strings, rmm::cuda_stream_default); -} - -bool all_float(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_float(strings, rmm::cuda_stream_default); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2bf65976986..b6d99efd51f 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -536,12 +537,50 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API - std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::from_floats(floats, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + // check strings for valid float chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_float(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace detail + +// external API +std::unique_ptr is_float(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ 
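+  // Public API: record an NVTX function range and forward to detail::is_float on the default stream.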
+ CUDF_FUNC_RANGE(); + return detail::is_float(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 112550fc25b..5c5032b5c87 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,6 @@ std::unique_ptr from_integers(column_view const& integers, } // namespace detail // external API - std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { @@ -253,5 +253,42 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_integer(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} +} // namespace detail + +// external API +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 68080c0eb89..f712b0cb6aa 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,17 +43,25 @@ namespace { * using the provided start, stop, and step parameters. 
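 *
 * The functor follows the two-pass pattern used by make_strings_children: when
 * d_chars is null it computes each row's output size and stores it in
 * d_offsets[idx]; when d_chars is non-null it writes the substring bytes into
 * the output chars buffer.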
*/ struct substring_fn { - const column_device_view d_column; - numeric_scalar_device_view d_start, d_stop, d_step; - const int32_t* d_offsets{}; + column_device_view const d_column; + numeric_scalar_device_view const d_start; + numeric_scalar_device_view const d_stop; + numeric_scalar_device_view const d_step; + int32_t* d_offsets{}; char* d_chars{}; - __device__ cudf::size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_column.is_null(idx)) return 0; // null string - string_view d_str = d_column.template element(idx); + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_column.template element(idx); auto const length = d_str.length(); - if (length == 0) return 0; // empty string + if (length == 0) { + if (!d_chars) d_offsets[idx] = 0; + return; + } size_type const step = d_step.is_valid() ? d_step.value() : 1; auto const begin = [&] { // always inclusive // when invalid, default depends on step @@ -88,7 +96,7 @@ struct substring_fn { if (d_buffer) d_buffer += from_char_utf8(*itr, d_buffer); itr += step; } - return bytes; + if (!d_chars) d_offsets[idx] = bytes; } }; @@ -103,42 +111,26 @@ std::unique_ptr slice_strings( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto d_start = get_scalar_device_view(const_cast&>(start)); - auto d_stop = get_scalar_device_view(const_cast&>(stop)); - auto d_step = get_scalar_device_view(const_cast&>(step)); - - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - - // build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_new_offsets = offsets_column->view().data(); - - // build chars column - auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars}); + auto const d_column = column_device_view::create(strings.parent(), stream); + auto const d_start = get_scalar_device_view(const_cast&>(start)); + auto const d_stop = get_scalar_device_view(const_cast&>(stop)); + auto const d_step = get_scalar_device_view(const_cast&>(step)); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + auto children = make_strings_children(substring_fn{*d_column, d_start, d_stop, d_step}, + strings.size(), + strings.null_count(), + stream, + mr); + + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - 
std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } @@ -166,25 +158,33 @@ namespace { * This both calculates the output size and executes the substring. */ struct substring_from_fn { - const column_device_view d_column; - const cudf::detail::input_indexalator starts; - const cudf::detail::input_indexalator stops; - const int32_t* d_offsets{}; + column_device_view const d_column; + cudf::detail::input_indexalator const starts; + cudf::detail::input_indexalator const stops; + int32_t* d_offsets{}; char* d_chars{}; - __device__ size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_column.is_null(idx)) return 0; // null string - string_view d_str = d_column.template element(idx); + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_column.template element(idx); auto const length = d_str.length(); auto const start = starts[idx]; - if (start >= length) return 0; // empty string + if (start >= length) { + if (!d_chars) d_offsets[idx] = 0; + return; + } auto const stop = stops[idx]; auto const end = (((stop < 0) || (stop > length)) ? length : stop); - string_view d_substr = d_str.substr(start, end - start); - if (d_chars) memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes()); - return d_substr.size_bytes(); + auto const d_substr = d_str.substr(start, end - start); + if (d_chars) + memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes()); + else + d_offsets[idx] = d_substr.size_bytes(); } }; @@ -212,32 +212,18 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c auto strings_count = d_column.size(); // Copy the null mask - rmm::device_buffer null_mask{0, stream, mr}; - if (d_column.nullable()) - null_mask = rmm::device_buffer( - d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); - - // Build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops}); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_new_offsets = offsets_column->view().data(); - - // Build chars column - auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars}); + rmm::device_buffer null_mask = + !d_column.nullable() + ? 
rmm::device_buffer{0, stream, mr} + : rmm::device_buffer( + d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); + + auto children = make_strings_children( + substring_from_fn{d_column, starts, stops}, strings_count, null_count, stream, mr); return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + std::move(children.first), + std::move(children.second), null_count, std::move(null_mask), stream, diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index afda6313254..4cd85fc5e7e 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -81,12 +81,4 @@ std::vector> table::release() return std::move(_columns); } -// Returns a table_view with set of specified columns -table_view table::select(std::vector const& column_indices) const -{ - std::vector columns; - for (auto index : column_indices) { columns.push_back(_columns.at(index)->view()); } - return table_view(columns); -} - } // namespace cudf diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 9c421f6fd36..c64bf5b2823 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -63,11 +63,7 @@ template class table_view_base; // Returns a table_view with set of specified columns table_view table_view::select(std::vector const& column_indices) const { - std::vector columns(column_indices.size()); - std::transform(column_indices.begin(), column_indices.end(), columns.begin(), [this](auto index) { - return this->column(index); - }); - return table_view(columns); + return select(column_indices.begin(), column_indices.end()); } // Convert mutable view to immutable view diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 40829c74957..e95aab16098 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -14,38 +14,6 @@ # limitations under the License. 
#============================================================================= -################################################################################################### -# - common test utils ----------------------------------------------------------------------------- - -find_package(Threads REQUIRED) - -add_library(cudftestutil STATIC - utilities/base_fixture.cpp - utilities/column_utilities.cu - utilities/table_utilities.cu - strings/utilities.cu) - -target_compile_options(cudftestutil - PUBLIC "$<$:${CUDF_CXX_FLAGS}>" - "$<$:${CUDF_CUDA_FLAGS}>" -) - -target_compile_features(cudftestutil PUBLIC cxx_std_14 cuda_std_14) - -target_link_libraries(cudftestutil - PUBLIC GTest::gmock - GTest::gtest - Threads::Threads - cudf) - -target_include_directories(cudftestutil - PUBLIC "$" - "$") - -install(TARGETS cudftestutil - DESTINATION lib - EXPORT cudf-targets) - ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -319,7 +287,6 @@ ConfigureTest(SEARCH_TEST search/search_test.cpp) # - reshape test ---------------------------------------------------------------------------------- ConfigureTest(RESHAPE_TEST reshape/byte_cast_tests.cpp - reshape/explode_tests.cpp reshape/interleave_columns_tests.cpp reshape/tile_tests.cpp) @@ -422,6 +389,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) ConfigureTest(LISTS_TEST lists/contains_tests.cpp lists/count_elements_tests.cpp + lists/explode_tests.cpp lists/drop_list_duplicates_tests.cpp lists/extract_tests.cpp lists/sort_lists_tests.cpp) diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp new file mode 100644 index 00000000000..2ec9294d118 --- /dev/null +++ b/cpp/tests/lists/explode_tests.cpp @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +using namespace cudf::test; +using FCW = fixed_width_column_wrapper; +using LCW = lists_column_wrapper; + +class ExplodeTest : public cudf::test::BaseFixture { +}; + +class ExplodeOuterTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeTypedTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeOuterTypedTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); + +TYPED_TEST_CASE(ExplodeOuterTypedTest, cudf::test::FixedPointTypes); + +TEST_F(ExplodeTest, Empty) +{ + cudf::table_view t({LCW{}, FCW{}}); + + auto ret = cudf::explode(t, 0); + + cudf::table_view expected({FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + auto pos_ret = cudf::explode_position(t, 0); + + cudf::table_view pos_expected({FCW{}, FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NonList) +{ + cudf::table_view t({FCW{100, 200, 300}, FCW{100, 200, 300}}); + + EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); + EXPECT_THROW(cudf::explode_position(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeTest, Basics) +{ + // a b c + // 100 [1, 2, 7] string0 + // 200 [5, 6] string1 + // 300 [0, 3] string2 + + FCW a{100, 200, 300}; + LCW b{LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + FCW expected_a{100, 100, 100, 200, 200, 300, 300}; + FCW expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode(t, 1); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); + + auto pos_ret = cudf::explode_position(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, SingleNull) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + auto first_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + FCW b({100, 200, 300, 400}); + + FCW expected_a{5, 6, 0, 3}; + FCW expected_b{200, 200, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, Nulls) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [0, 3] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + auto always_valid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + FCW b({100, 200, 300}, valids); + + FCW expected_a({1, 2, 7, 0, 3}); + FCW expected_b({100, 100, 100, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInList) +{ + // a b + // [1, 2, 7] 100 + // [5, 6, 0, 9] 200 + // [] 300 + // [0, 3, 8] 400 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + FCW b{100, 200, 300, 400}; + + FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + FCW expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, Nested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[],[5],[2, 1]] 300 + + LCW a{LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}}; + FCW b{100, 200, 300}; + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}; + FCW expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 3}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NestedNulls) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + auto always_valid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, 200, 300}, valids); + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{0, 3}, LCW{5}, LCW{2, 1}}; + FCW expected_b({100, 100, 300, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInNested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b({100, 200, 300}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInNestedDoubleExplode) +{ + // a b + // [[1, 2], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + FCW b{100, 200, 300}; + + FCW expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_b{100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto first_explode_ret = cudf::explode(t, 0); + auto ret = cudf::explode(first_explode_ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(first_explode_ret->view(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NestedStructs) +{ + // a b + // [[1, 2], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, 1]] {300, "300"} + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TYPED_TEST(ExplodeTypedTest, ListOfStructs) +{ + // a b + // [{70, "70"}, {75, "75"}] 100 + // [{50, "50"}, {55, "55"}] 200 + // [{35, "35"}, {45, "45"}] 300 + // [{25, "25"}, {30, "30"}] 400 + // [{15, "15"}, {20, "20"}] 500 + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column( + 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + + FCW b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + FCW expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, SlicedList) +{ + // a b + // [[1, 2],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + // [[8, 3],[],[4, 3, 1, 2]] 400 + // [[2, 3, 4],[9, 8]] 500 + + // slicing the top 2 rows and the bottom row off + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{2, 3, 4}, LCW{9, 8}}}); + FCW b({100, 200, 300, 400, 500}); + + LCW expected_a{ + LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + FCW expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(sliced_t[0], 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Empty) +{ + LCW a{}; + FCW b{}; + + cudf::table_view t({LCW{}, FCW{}}); + + auto ret = cudf::explode_outer(t, 0); + + FCW expected_a{}; + FCW expected_b{}; + cudf::table_view expected({FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeOuterTest, NonList) +{ + cudf::table_view t({FCW{100, 200, 300}, FCW{100, 200, 300}}); + + EXPECT_THROW(cudf::explode_outer(t, 1), cudf::logic_error); + EXPECT_THROW(cudf::explode_outer_position(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeOuterTest, Basics) +{ + // a b c + // 100 [1, 2, 7] string0 + // 200 [5, 6] string1 + // 300 [0, 3] string2 + + FCW a{100, 200, 300}; + LCW b{LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + FCW expected_a{100, 100, 100, 200, 200, 300, 300}; + FCW expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode_outer(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); + + auto pos_ret = cudf::explode_outer_position(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SingleNull) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + auto first_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + FCW b({100, 200, 300, 400}); + + FCW expected_a{{0, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}}; + FCW expected_b{100, 200, 200, 300, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Nulls) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [0, 3] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + FCW b({100, 200, 300}, valids); + + FCW expected_a({1, 2, 7, 0, 0, 3}, {1, 1, 1, 0, 1, 1}); + FCW expected_b({100, 100, 100, 200, 300, 300}, {1, 1, 1, 0, 1, 1}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInList) +{ + // a b + // [1, 2, 7] 100 + // [5, 6, 0, 9] 200 + // [] 300 + // [0, 3, 8] 400 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + FCW b{100, 200, 300, 400}; + + FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}); + FCW expected_b{100, 100, 100, 200, 200, 200, 200, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Nested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[],[5],[2, 1]] 300 + + LCW a{LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}}; + FCW b{100, 200, 300}; + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}; + FCW expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 3}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NestedNulls) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, 200, 300}); + + auto expected_valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? 
false : true; }); + LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, expected_valids); + FCW expected_b({100, 100, 200, 300, 300, 300}); + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInNested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b({100, 200, 300}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) +{ + // a b + // [[1, 2], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + FCW b{100, 200, 300}; + + FCW expected_a({1, 2, 0, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_b{100, 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto first_explode_ret = cudf::explode_outer(t, 0); + auto ret = cudf::explode_outer(first_explode_ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(first_explode_ret->view(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NestedStructs) +{ + // a b + // [[1, 2], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, 1]] {300, "300"} + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs) +{ + // a b + // [{70, "70"}, {75, "75"}] 100 + // [{50, "50"}, {55, "55"}] 200 + // [{35, "35"}, {45, "45"}] 300 + // [{25, "25"}, {30, "30"}] 400 + // [{15, "15"}, {20, "20"}] 500 + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column( + 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + + FCW b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode_outer(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + FCW expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SlicedList) +{ + // a b + // [[1, 2],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + // [[8, 3],[],[4, 3, 1, 2]] 400 + // [[2, 3, 4],[9, 8]] 500 + + // slicing the top 2 rows and the bottom row off + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{2, 3, 4}, LCW{9, 8}}}); + FCW b({100, 200, 300, 400, 500}); + + LCW expected_a{ + LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + FCW expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(sliced_t[0], 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index fa3bde8cb52..451fa82d5a3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -729,4 +729,36 @@ TEST_F(MergeTest, KeysWithNulls) } } +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, FixedPointMerge) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + + auto const a = fp_wrapper{{4, 22, 33, 44, 55}, scale_type{-1}}; + auto const b = fp_wrapper{{5, 7, 10}, scale_type{-1}}; + auto const table_a = cudf::table_view(std::vector{a}); + auto const table_b = cudf::table_view(std::vector{b}); + auto const tables = std::vector{table_a, table_b}; + + auto const key_cols = std::vector{0}; + auto const order = std::vector{cudf::order::ASCENDING}; + + auto const exp = fp_wrapper{{4, 5, 7, 10, 22, 33, 44, 55}, scale_type{-1}}; + auto const exp_table = cudf::table_view(std::vector{exp}); + + auto const result = cudf::merge(tables, key_cols, order); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(exp_table.column(0), result->view().column(0)); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp deleted file mode 100644 index 5f3237ce46d..00000000000 --- a/cpp/tests/reshape/explode_tests.cpp +++ /dev/null @@ -1,530 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include -#include - -using namespace cudf::test; - -class ExplodeTest : public cudf::test::BaseFixture { -}; - -template -class ExplodeTypedTest : public cudf::test::BaseFixture { -}; - -TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); - -TEST_F(ExplodeTest, Empty) -{ - lists_column_wrapper a{}; - fixed_width_column_wrapper b{}; - - cudf::table_view t({a, b}); - - auto ret = cudf::explode(t, 0); - - fixed_width_column_wrapper expected_a{}; - fixed_width_column_wrapper expected_b{}; - cudf::table_view expected({expected_a, expected_b}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - auto pos_ret = cudf::explode_position(t, 0); - - fixed_width_column_wrapper expected_c{}; - cudf::table_view pos_expected({expected_a, expected_b, expected_c}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NonList) -{ - fixed_width_column_wrapper a{100, 200, 300}; - fixed_width_column_wrapper b{100, 200, 300}; - - cudf::table_view t({a, b}); - - EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); - EXPECT_THROW(cudf::explode_position(t, 1), cudf::logic_error); -} - -TEST_F(ExplodeTest, Basics) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [0, 3] 300 - */ - - fixed_width_column_wrapper a{100, 200, 300}; - lists_column_wrapper b{lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}}; - strings_column_wrapper c{"string0", "string1", "string2"}; - - fixed_width_column_wrapper expected_a{100, 100, 100, 200, 200, 300, 300}; - fixed_width_column_wrapper expected_b{1, 2, 7, 5, 6, 0, 3}; - strings_column_wrapper expected_c{ - "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; - - cudf::table_view t({a, b, c}); - cudf::table_view expected({expected_a, expected_b, expected_c}); - - auto ret = cudf::explode(t, 1); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 0, 1}; - cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); - - auto pos_ret = cudf::explode_position(t, 1); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, SingleNull) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [] 300 - [0, 3] 400 - */ - - auto first_invalid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); - - lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{}, - lists_column_wrapper{0, 3}}, - first_invalid); - fixed_width_column_wrapper b({100, 200, 300, 400}); - - fixed_width_column_wrapper expected_a{5, 6, 0, 3}; - fixed_width_column_wrapper expected_b{200, 200, 400, 400}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, Nulls) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [0, 3] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - auto always_valid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}}, - valids); - fixed_width_column_wrapper b({100, 200, 300}, valids); - - fixed_width_column_wrapper expected_a({1, 2, 7, 0, 3}); - fixed_width_column_wrapper expected_b({100, 100, 100, 300, 300}, always_valid); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInList) -{ - /* - a b - [1, 2, 7] 100 - [5, 6, 0, 9] 200 - [] 300 - [0, 3, 8] 400 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - - lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), - lists_column_wrapper({5, 6, 0, 9}, valids), - lists_column_wrapper{}, - lists_column_wrapper({0, 3, 8}, valids)}; - fixed_width_column_wrapper b{100, 200, 300, 400}; - - fixed_width_column_wrapper expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, - {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); - fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, Nested) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[],[5],[2, 1]] 300 - */ - - lists_column_wrapper a{ - lists_column_wrapper{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}}; - fixed_width_column_wrapper b{100, 200, 300}; - - lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}; - fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2, 3}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NestedNulls) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - auto always_valid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}}, - valids); - fixed_width_column_wrapper b({100, 200, 300}, valids); - - lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}; - fixed_width_column_wrapper expected_b({100, 100, 300, 300, 300}, always_valid); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInNested) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}); - fixed_width_column_wrapper b({100, 200, 300}); - - lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}; - fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInNestedDoubleExplode) -{ - /* - a b - [[1, 2], [], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a{ - lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}; - fixed_width_column_wrapper b{100, 200, 300}; - - fixed_width_column_wrapper expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - fixed_width_column_wrapper expected_b{ - 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto first_explode_ret = cudf::explode(t, 0); - auto ret = cudf::explode(first_explode_ret->view(), 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(first_explode_ret->view(), 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NestedStructs) -{ - /* - a b - [[1, 2], [7, 6, 5]] {100, "100"} - [[5, 6]] {200, "200"} - [[0, 3],[5],[2, 1]] {300, "300"} - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}); - fixed_width_column_wrapper b1({100, 200, 300}); - strings_column_wrapper b2{"100", "200", "300"}; - structs_column_wrapper b({b1, b2}); - - lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}; - fixed_width_column_wrapper expected_b1{100, 100, 200, 300, 300, 300}; - strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; - structs_column_wrapper expected_b({expected_b1, expected_b2}); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TYPED_TEST(ExplodeTypedTest, ListOfStructs) -{ - /* - a b - [{70, "70"}, {75, "75"}] 100 - [{50, "50"}, {55, "55"}] 200 - [{35, "35"}, {45, "45"}] 300 - [{25, "25"}, {30, "30"}] 400 - [{15, "15"}, {20, "20"}] 500 -*/ - - auto numeric_col = - fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; - strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; - auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); - auto a = cudf::make_lists_column(5, - fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), - std::move(struct_col), - cudf::UNKNOWN_NULL_COUNT, - {}); - - fixed_width_column_wrapper b{100, 200, 300, 400, 500}; - - cudf::table_view t({a->view(), b}); - auto ret = cudf::explode(t, 0); - - 
auto expected_numeric_col = - fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; - strings_column_wrapper expected_string_col{ - "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; - - auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); - fixed_width_column_wrapper expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; - - cudf::table_view expected({expected_a->view(), expected_b}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, SlicedList) -{ - /* - a b - [[1, 2],[7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - [[8, 3],[],[4, 3, 1, 2]] 400 - [[2, 3, 4],[9, 8]] 500 - - slicing the top 2 rows and the bottom row off - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}, - lists_column_wrapper{lists_column_wrapper{8, 3}, - lists_column_wrapper{}, - lists_column_wrapper({4, 3, 1, 2}, valids)}, - lists_column_wrapper{lists_column_wrapper{2, 3, 4}, - lists_column_wrapper{9, 8}}}); - fixed_width_column_wrapper b({100, 200, 300, 400, 500}); - - lists_column_wrapper expected_a{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids), - lists_column_wrapper{8, 3}, - lists_column_wrapper{}, - lists_column_wrapper({4, 3, 1, 2}, valids)}; - fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400, 400}; - - cudf::table_view t({a, b}); - auto sliced_t = cudf::slice(t, {2, 4}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(sliced_t[0], 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(sliced_t[0], 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 803a9b01b07..702329edaba 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -228,54 +227,6 @@ TEST_F(StringsCharsTest, Numerics) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsCharsTest, Integers) -{ - cudf::test::strings_column_wrapper strings1( - {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_integer(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_integer(cudf::strings_column_view(strings2))); -} - -TEST_F(StringsCharsTest, Floats) -{ - cudf::test::strings_column_wrapper strings1({"+175", - "-9.8", - "7+2", - "+-4", - "6.7e17", - "-1.2e-5", - "e", - ".e", - "1.e+-2", - "00.00", - "1.0e+1.0", - "1.2.3", - "+", - "--", - ""}); - auto results = cudf::strings::is_float(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1( - {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_float(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); - results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_float(cudf::strings_column_view(strings2))); -} - TEST_F(StringsCharsTest, EmptyStrings) { cudf::test::strings_column_wrapper strings({"", "", ""}); @@ -284,12 +235,6 @@ TEST_F(StringsCharsTest, EmptyStrings) auto results = cudf::strings::all_characters_of_type( strings_view, cudf::strings::string_character_types::ALPHANUM); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::is_integer(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_integer(strings_view)); - results = cudf::strings::is_float(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_float(strings_view)); } TEST_F(StringsCharsTest, FilterCharTypes) @@ -379,14 +324,6 @@ TEST_F(StringsCharsTest, EmptyStringsColumn) EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); EXPECT_EQ(0, results->view().size()); - results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - - results = cudf::strings::is_float(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - results = cudf::strings::filter_characters_of_type( strings_view, cudf::strings::string_character_types::NUMERIC); EXPECT_EQ(cudf::type_id::STRING, results->view().type().id()); diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index f904c404251..bd463a7ab0d 100644 --- a/cpp/tests/strings/factories_test.cu +++ 
b/cpp/tests/strings/factories_test.cu @@ -19,12 +19,18 @@ #include #include #include +#include #include #include #include #include #include +#include + +#include +#include + #include #include @@ -198,3 +204,31 @@ TEST_F(StringsFactoriesTest, CreateOffsets) } } } + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) +{ + cudf::test::strings_column_wrapper data( + {"", "this", "is", "", "a", "", "column", "of", "strings", "", ""}, + {0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1}); + + auto d_column = cudf::column_device_view::create(data); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + auto result = cudf::make_strings_column(pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); +} diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index b98416d9edd..f7151363d83 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -27,6 +27,41 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsFloat) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_float(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1({"+175", + "-9.8", + "7+2", + "+-4", + "6.7e17", + "-1.2e-5", + "e", + ".e", + "1.e+-2", + "00.00", + "1.0e+1.0", + "1.2.3", + "+", + "--", + ""}); + results = cudf::strings::is_float(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1( + {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + results = cudf::strings::is_float(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToFloats32) { std::vector h_strings{"1234", diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 9e2b9809b26..d6bf03b3f76 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,27 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsInteger) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToInteger) { std::vector h_strings{ diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java index e5e849a74c4..68213c21956 100644 --- a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java +++ b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java @@ -38,7 +38,7 @@ public interface BinaryOperable { * with scale=0 as scale is required. Dtype is discarded for binary operations for decimal * types in cudf as a new DType is created for output type with the new scale. */ - static DType implicitConversion(BinaryOperable lhs, BinaryOperable rhs) { + static DType implicitConversion(BinaryOp op, BinaryOperable lhs, BinaryOperable rhs) { DType a = lhs.getType(); DType b = rhs.getType(); if (a.equals(DType.FLOAT64) || b.equals(DType.FLOAT64)) { @@ -86,13 +86,15 @@ static DType implicitConversion(BinaryOperable lhs, BinaryOperable rhs) { int scale = 0; if (a.typeId == DType.DTypeEnum.DECIMAL32) { if (b.typeId == DType.DTypeEnum.DECIMAL32) { - return DType.create(DType.DTypeEnum.DECIMAL32, scale); + return DType.create(DType.DTypeEnum.DECIMAL32, + ColumnView.getFixedPointOutputScale(op, lhs.getType(), rhs.getType())); } else { throw new IllegalArgumentException("Both columns must be of the same fixed_point type"); } } else if (a.typeId == DType.DTypeEnum.DECIMAL64) { if (b.typeId == DType.DTypeEnum.DECIMAL64) { - return DType.create(DType.DTypeEnum.DECIMAL64, scale); + return DType.create(DType.DTypeEnum.DECIMAL64, + ColumnView.getFixedPointOutputScale(op, lhs.getType(), rhs.getType())); } else { throw new IllegalArgumentException("Both columns must be of the same fixed_point type"); } @@ -128,7 +130,7 @@ default ColumnVector add(BinaryOperable rhs, DType outType) { * Add + operator. this + rhs */ default ColumnVector add(BinaryOperable rhs) { - return add(rhs, implicitConversion(this, rhs)); + return add(rhs, implicitConversion(BinaryOp.ADD, this, rhs)); } /** @@ -144,7 +146,7 @@ default ColumnVector sub(BinaryOperable rhs, DType outType) { * Subtract one vector from another. this - rhs */ default ColumnVector sub(BinaryOperable rhs) { - return sub(rhs, implicitConversion(this, rhs)); + return sub(rhs, implicitConversion(BinaryOp.SUB, this, rhs)); } /** @@ -160,7 +162,7 @@ default ColumnVector mul(BinaryOperable rhs, DType outType) { * Multiply two vectors together. 
this * rhs */ default ColumnVector mul(BinaryOperable rhs) { - return mul(rhs, implicitConversion(this, rhs)); + return mul(rhs, implicitConversion(BinaryOp.MUL, this, rhs)); } /** @@ -176,7 +178,7 @@ default ColumnVector div(BinaryOperable rhs, DType outType) { * Divide one vector by another. this / rhs */ default ColumnVector div(BinaryOperable rhs) { - return div(rhs, implicitConversion(this, rhs)); + return div(rhs, implicitConversion(BinaryOp.DIV, this, rhs)); } /** @@ -192,7 +194,7 @@ default ColumnVector trueDiv(BinaryOperable rhs, DType outType) { * (double)this / (double)rhs */ default ColumnVector trueDiv(BinaryOperable rhs) { - return trueDiv(rhs, implicitConversion(this, rhs)); + return trueDiv(rhs, implicitConversion(BinaryOp.TRUE_DIV, this, rhs)); } /** @@ -208,7 +210,7 @@ default ColumnVector floorDiv(BinaryOperable rhs, DType outType) { * Math.floor(this/rhs) */ default ColumnVector floorDiv(BinaryOperable rhs) { - return floorDiv(rhs, implicitConversion(this, rhs)); + return floorDiv(rhs, implicitConversion(BinaryOp.FLOOR_DIV, this, rhs)); } /** @@ -224,7 +226,7 @@ default ColumnVector mod(BinaryOperable rhs, DType outType) { * this % rhs */ default ColumnVector mod(BinaryOperable rhs) { - return mod(rhs, implicitConversion(this, rhs)); + return mod(rhs, implicitConversion(BinaryOp.MOD, this, rhs)); } /** @@ -240,7 +242,7 @@ default ColumnVector pow(BinaryOperable rhs, DType outType) { * Math.pow(this, rhs) */ default ColumnVector pow(BinaryOperable rhs) { - return pow(rhs, implicitConversion(this, rhs)); + return pow(rhs, implicitConversion(BinaryOp.POW, this, rhs)); } /** @@ -338,7 +340,7 @@ default ColumnVector bitAnd(BinaryOperable rhs, DType outType) { * Bit wise and (&). this & rhs */ default ColumnVector bitAnd(BinaryOperable rhs) { - return bitAnd(rhs, implicitConversion(this, rhs)); + return bitAnd(rhs, implicitConversion(BinaryOp.BITWISE_AND, this, rhs)); } /** @@ -352,7 +354,7 @@ default ColumnVector bitOr(BinaryOperable rhs, DType outType) { * Bit wise or (|). this | rhs */ default ColumnVector bitOr(BinaryOperable rhs) { - return bitOr(rhs, implicitConversion(this, rhs)); + return bitOr(rhs, implicitConversion(BinaryOp.BITWISE_OR, this, rhs)); } /** @@ -366,7 +368,7 @@ default ColumnVector bitXor(BinaryOperable rhs, DType outType) { * Bit wise xor (^). this ^ rhs */ default ColumnVector bitXor(BinaryOperable rhs) { - return bitXor(rhs, implicitConversion(this, rhs)); + return bitXor(rhs, implicitConversion(BinaryOp.BITWISE_XOR, this, rhs)); } /** @@ -380,7 +382,7 @@ default ColumnVector and(BinaryOperable rhs, DType outType) { * Logical and (&&). this && rhs */ default ColumnVector and(BinaryOperable rhs) { - return and(rhs, implicitConversion(this, rhs)); + return and(rhs, implicitConversion(BinaryOp.LOGICAL_AND, this, rhs)); } /** @@ -394,7 +396,7 @@ default ColumnVector or(BinaryOperable rhs, DType outType) { * Logical or (||). 
this || rhs */ default ColumnVector or(BinaryOperable rhs) { - return or(rhs, implicitConversion(this, rhs)); + return or(rhs, implicitConversion(BinaryOp.LOGICAL_OR, this, rhs)); } /** @@ -421,7 +423,7 @@ default ColumnVector shiftLeft(BinaryOperable shiftBy, DType outType) { * with this[i] << shiftBy */ default ColumnVector shiftLeft(BinaryOperable shiftBy) { - return shiftLeft(shiftBy, implicitConversion(this, shiftBy)); + return shiftLeft(shiftBy, implicitConversion(BinaryOp.SHIFT_LEFT, this, shiftBy)); } /** @@ -447,7 +449,7 @@ default ColumnVector shiftRight(BinaryOperable shiftBy, DType outType) { * with this[i] >> shiftBy */ default ColumnVector shiftRight(BinaryOperable shiftBy) { - return shiftRight(shiftBy, implicitConversion(this, shiftBy)); + return shiftRight(shiftBy, implicitConversion(BinaryOp.SHIFT_RIGHT, this, shiftBy)); } /** @@ -475,7 +477,8 @@ default ColumnVector shiftRightUnsigned(BinaryOperable shiftBy, DType outType) { * with this[i] >>> shiftBy */ default ColumnVector shiftRightUnsigned(BinaryOperable shiftBy) { - return shiftRightUnsigned(shiftBy, implicitConversion(this, shiftBy)); + return shiftRightUnsigned(shiftBy, implicitConversion(BinaryOp.SHIFT_RIGHT_UNSIGNED, this, + shiftBy)); } /** @@ -505,7 +508,7 @@ default ColumnVector arctan2(BinaryOperable xCoordinate, DType outType) { * in radians, between the positive x axis and the ray to the point (x, y) ≠ (0, 0). */ default ColumnVector arctan2(BinaryOperable xCoordinate) { - return arctan2(xCoordinate, implicitConversion(this, xCoordinate)); + return arctan2(xCoordinate, implicitConversion(BinaryOp.ATAN2, this, xCoordinate)); } /** @@ -529,7 +532,7 @@ default ColumnVector pmod(BinaryOperable rhs, DType outputType) { * */ default ColumnVector pmod(BinaryOperable rhs) { - return pmod(rhs, implicitConversion(this, rhs)); + return pmod(rhs, implicitConversion(BinaryOp.PMOD, this, rhs)); } /** @@ -557,7 +560,7 @@ default ColumnVector maxNullAware(BinaryOperable rhs, DType outType) { * Returns the max non null value. */ default ColumnVector maxNullAware(BinaryOperable rhs) { - return maxNullAware(rhs, implicitConversion(this, rhs)); + return maxNullAware(rhs, implicitConversion(BinaryOp.NULL_MAX, this, rhs)); } /** @@ -571,6 +574,7 @@ default ColumnVector minNullAware(BinaryOperable rhs, DType outType) { * Returns the min non null value. 
*/ default ColumnVector minNullAware(BinaryOperable rhs) { - return minNullAware(rhs, implicitConversion(this, rhs)); + return minNullAware(rhs, implicitConversion(BinaryOp.NULL_MIN, this, rhs)); } + } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index f36896a3c96..2f3f2bf80cf 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -129,6 +129,13 @@ public final long getNativeView() { return viewHandle; } + static int getFixedPointOutputScale(BinaryOp op, DType lhsType, DType rhsType) { + assert (lhsType.isDecimalType() && rhsType.isDecimalType()); + return fixedPointOutputScale(op.nativeId, lhsType.getScale(), rhsType.getScale()); + } + + private static native int fixedPointOutputScale(int op, int lhsScale, int rhsScale); + public final DType getType() { return type; } diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index b521bc5c42c..eb31edd8222 100644 --- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,8 @@ * Hash algorithm identifiers, mirroring native enum cudf::hash_id */ public enum HashType { - // TODO IDENTITY(0), - // TODO MURMUR3(1), + IDENTITY(0), + MURMUR3(1), HASH_MD5(2), HASH_SERIAL_MURMUR3(3), HASH_SPARK_MURMUR3(4); diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index bf49fb59d52..6c52b8fe798 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -353,6 +353,50 @@ static SerializedColumnHeader readFrom(DataInputStream din, long rowCount) throw } } + /** Class to hold the header and buffer pair result from host-side concatenation */ + public static final class HostConcatResult implements AutoCloseable { + private final SerializedTableHeader tableHeader; + private final HostMemoryBuffer hostBuffer; + + public HostConcatResult(SerializedTableHeader tableHeader, HostMemoryBuffer tableBuffer) { + this.tableHeader = tableHeader; + this.hostBuffer = tableBuffer; + } + + public SerializedTableHeader getTableHeader() { + return tableHeader; + } + + public HostMemoryBuffer getHostBuffer() { + return hostBuffer; + } + + /** Build a contiguous table in device memory from this host-concatenated result */ + public ContiguousTable toContiguousTable() { + DeviceMemoryBuffer devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); + try { + if (hostBuffer.length > 0) { + devBuffer.copyFromHostBuffer(hostBuffer); + } + Table table = sliceUpColumnVectors(tableHeader, devBuffer, hostBuffer); + try { + return new ContiguousTable(table, devBuffer); + } catch (Exception e) { + table.close(); + throw e; + } + } catch (Exception e) { + devBuffer.close(); + throw e; + } + } + + @Override + public void close() { + hostBuffer.close(); + } + } + /** * Visible for testing */ @@ -1681,15 +1725,32 @@ public static Table readAndConcat(SerializedTableHeader[] headers, return ct.getTable(); } + /** + * Concatenate multiple tables in host memory into a contiguous table in device memory. + * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return contiguous table in device memory + */ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] headers, HostMemoryBuffer[] dataBuffers) throws IOException { + try (HostConcatResult concatResult = concatToHostBuffer(headers, dataBuffers)) { + return concatResult.toContiguousTable(); + } + } + + /** + * Concatenate multiple tables in host memory into a single host table buffer. 
+ * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return host table header and buffer + */ + public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] headers, + HostMemoryBuffer[] dataBuffers) throws IOException { ColumnBufferProvider[][] providersPerColumn = providersFrom(headers, dataBuffers); - DeviceMemoryBuffer devBuffer = null; - Table table = null; try { SerializedTableHeader combined = calcConcatHeader(providersPerColumn); - - try (HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen)) { + HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen); + try { try (NvtxRange range = new NvtxRange("Concat Host Side", NvtxColor.GREEN)) { DataWriter writer = writerFrom(hostBuffer); int numColumns = combined.getNumColumns(); @@ -1697,27 +1758,14 @@ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] he writeConcat(writer, combined.getColumnHeader(columnIdx), providersPerColumn[columnIdx]); } } - - devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); - if (hostBuffer.length > 0) { - try (NvtxRange range = new NvtxRange("Copy Data To Device", NvtxColor.WHITE)) { - devBuffer.copyFromHostBuffer(hostBuffer); - } - } - table = sliceUpColumnVectors(combined, devBuffer, hostBuffer); - ContiguousTable result = new ContiguousTable(table, devBuffer); - table = null; - devBuffer = null; - return result; + } catch (Exception e) { + hostBuffer.close(); + throw e; } + + return new HostConcatResult(combined, hostBuffer); } finally { closeAll(providersPerColumn); - if (table != null) { - table.close(); - } - if (devBuffer != null) { - devBuffer.close(); - } } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index fcc23777d69..0dc529d423f 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -183,8 +183,12 @@ public long getDeviceMemorySize() { private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); + private static native long[] partition(long inputTable, long partitionView, + int numberOfPartitions, int[] outputOffsets); + private static native long[] hashPartition(long inputTable, int[] columnsToHash, + int hashTypeId, int numberOfPartitions, int[] outputOffsets) throws CudfException; @@ -515,6 +519,10 @@ private static native long[] repeatColumnCount(long tableHandle, private static native long[] explodePosition(long tableHandle, int index); + private static native long[] explodeOuter(long tableHandle, int index); + + private static native long[] explodeOuterPosition(long tableHandle, int index); + private static native long createCudfTableView(long[] nativeColumnViewHandles); private static native long[] columnViewsFromPacked(ByteBuffer metadata, long dataAddress); @@ -1252,6 +1260,24 @@ public Table repeat(ColumnVector counts, boolean checkCount) { return new Table(repeatColumnCount(this.nativeHandle, counts.getNativeView(), checkCount)); } + /** + * Partition this table using the mapping in partitionMap. partitionMap must be an integer + * column. The number of rows in partitionMap must be the same as this table. Each row + * in the map will indicate which partition the rows in the table belong to. + * @param partitionMap the partitions for each row. 
+ * @param numberOfPartitions number of partitions + * @return {@link PartitionedTable} Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable partition(ColumnView partitionMap, int numberOfPartitions) { + int[] partitionOffsets = new int[numberOfPartitions]; + return new PartitionedTable(new Table(partition( + getNativeView(), + partitionMap.getNativeView(), + partitionOffsets.length, + partitionOffsets)), partitionOffsets); + } + /** * Find smallest indices in a sorted table where values should be inserted to maintain order. *
@@ -1724,7 +1750,7 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    * Example:
    * input:  [[5,10,15], 100],
    *         [[20,25],   200],
-   *         [[30],      300],
+   *         [[30],      300]
    * index: 0
    * output: [5,         100],
    *         [10,        100],
@@ -1736,12 +1762,12 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    *
    * Nulls propagate in different ways depending on what is null.
    * <pre>
-   *     [[5,null,15], 100],
-   *     [null,        200]
-   * returns:
-   *     [5,           100],
-   *     [null,        100],
-   *     [15,          100]
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [5,           100],
+   *         [null,        100],
+   *         [15,          100]
    * </pre>
    * Note that null lists are completely removed from the output
    * and nulls inside lists are pulled out and remain.
@@ -1762,27 +1788,26 @@ public Table explode(int index) {
    * in the output. The corresponding rows for other columns in the input are duplicated. A position
    * column is added that has the index inside the original list for each row. Example:
    * <pre>
-   * [[5,10,15], 100],
-   * [[20,25],   200],
-   * [[30],      300],
-   * returns
-   * [0,   5,    100],
-   * [1,   10,   100],
-   * [2,   15,    100],
-   * [0,   20,    200],
-   * [1,   25,    200],
-   * [0,   30,    300],
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [0,   5,    100],
+   *         [1,   10,   100],
+   *         [2,   15,   100],
+   *         [0,   20,   200],
+   *         [1,   25,   200],
+   *         [0,   30,   300]
    * </pre>
    *
    * Nulls and empty lists propagate in different ways depending on what is null or empty.
    * <pre>
-   * [[5,null,15], 100],
-   * [null,        200],
-   * [[],          300],
-   * returns
-   * [0,    5,     100],
-   * [1,    null,  100],
-   * [2,    15,    100],
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [0,    5,     100],
+   *         [1,    null,  100],
+   *         [2,    15,    100]
    * </pre>
    *
    * Note that null lists are not included in the resulting table, but nulls inside
@@ -1798,6 +1823,96 @@ public Table explodePosition(int index) {
     return new Table(explodePosition(nativeHandle, index));
   }
 
+  /**
+   * Explodes a list column's elements retaining any null entries or empty lists.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded
+   * into new rows in the output. The corresponding rows for other columns in the input
+   * are duplicated.
+   *
+   * <pre>
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [5,         100],
+   *         [10,        100],
+   *         [15,        100],
+   *         [20,        200],
+   *         [25,        200],
+   *         [30,        300]
+   * </pre>
+   *
+   * Nulls propagate in different ways depending on what is null.
+   * <pre>
+   *  input:  [[5,null,15], 100],
+   *          [null,        200]
+   * index: 0
+   * output:  [5,           100],
+   *          [null,        100],
+   *          [15,          100],
+   *          [null,        200]
+   * </pre>
+   * Note that null lists are not removed from the output; they appear as null entries,
+   * and nulls inside lists are pulled out and remain.
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with explode_col exploded.
+   */
+  public Table explodeOuter(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuter(nativeHandle, index));
+  }
+
+  /**
+   * Explodes a list column's elements retaining any null entries or empty lists and includes a
+   * position column.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded into new rows
+   * in the output. The corresponding rows for other columns in the input are duplicated. A position
+   * column is added that has the index inside the original list for each row.
+   *
+   * <pre>
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [0,   5,    100],
+   *         [1,   10,   100],
+   *         [2,   15,   100],
+   *         [0,   20,   200],
+   *         [1,   25,   200],
+   *         [0,   30,   300]
+   * </pre>
+   *
+   * Nulls and empty lists propagate as null entries in the result.
+   * <pre>
+   * input:  [[5,null,15], 100],
+   *         [null,        200],
+   *         [[],          300]
+   * index: 0
+   * output: [0,     5,    100],
+   *         [1,  null,    100],
+   *         [2,    15,    100],
+   *         [0,  null,    200],
+   *         [0,  null,    300]
+   * </pre>
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with exploded value and position. The column order of return table is
+   *         [cols before explode_input, explode_position, explode_value, cols after explode_input].
+   */
+  public Table explodeOuterPosition(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuterPosition(nativeHandle, index));
+  }
+
   /**
    * Gathers the rows of this table according to `gatherMap` such that row "i"
    * in the resulting table's columns will contain row "gatherMap[i]" from this table.
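The two methods added above behave like explode()/explodePosition() but keep null and empty lists as null output rows. A minimal, hedged sketch of calling them from Java follows; the list/integer values and the use of ColumnVector.fromLists with HostColumnVector type descriptors are illustrative assumptions, not part of this patch:

import java.util.Arrays;

import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.DType;
import ai.rapids.cudf.HostColumnVector;
import ai.rapids.cudf.Table;

public class ExplodeOuterSketch {
  public static void main(String[] args) {
    HostColumnVector.ListType listOfInts =
        new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.INT32));
    try (ColumnVector lists = ColumnVector.fromLists(listOfInts,
             Arrays.asList(5, 10, 15), Arrays.asList(20, 25), null);
         ColumnVector other = ColumnVector.fromInts(100, 200, 300);
         Table input = new Table(lists, other);
         // Unlike explode(), the null list row survives as a null entry in the output.
         Table outer = input.explodeOuter(0);
         // explodeOuterPosition() also inserts the within-list position before the exploded values.
         Table outerWithPos = input.explodeOuterPosition(0)) {
      System.out.println("explodeOuter rows: " + outer.getRowCount());           // 3 + 2 + 1
      System.out.println("explodeOuterPosition rows: " + outerWithPos.getRowCount());
    }
  }
}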
@@ -2587,15 +2702,31 @@ public Table leftAntiJoin(TableOperation rightJoinIndices) {
     }
 
     /**
-     * Hash partition a table into the specified number of partitions.
+     * Hash partition a table into the specified number of partitions. Uses the default MURMUR3
+     * hashing.
      * @param numberOfPartitions - number of partitions to use
      * @return - {@link PartitionedTable} - Table that exposes a limited functionality of the
      * {@link Table} class
      */
     public PartitionedTable hashPartition(int numberOfPartitions) {
+      return hashPartition(HashType.MURMUR3, numberOfPartitions);
+    }
+
+    /**
+     * Hash partition a table into the specified number of partitions.
+     * @param type the type of hash to use. Depending on the hash type, different restrictions
+     *             on the hash column(s) may apply. Only IDENTITY and MURMUR3 are guaranteed
+     *             to be supported.
+     * @param numberOfPartitions - number of partitions to use
+     * @return {@link PartitionedTable} - Table that exposes a limited functionality of the
+     * {@link Table} class
+     */
+    public PartitionedTable hashPartition(HashType type, int numberOfPartitions) {
       int[] partitionOffsets = new int[numberOfPartitions];
-      return new PartitionedTable(new Table(Table.hashPartition(operation.table.nativeHandle,
+      return new PartitionedTable(new Table(Table.hashPartition(
+          operation.table.nativeHandle,
           operation.indices,
+          type.nativeId,
           partitionOffsets.length,
           partitionOffsets)), partitionOffsets);
     }
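For reference, a hedged sketch of driving both partitioning paths added in Table.java; every API used here appears in this patch or the existing tests, while the column contents and partition counts are illustrative assumptions:

import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.HashType;
import ai.rapids.cudf.PartitionedTable;
import ai.rapids.cudf.Table;

public class PartitionSketch {
  public static void main(String[] args) {
    try (ColumnVector data = ColumnVector.fromInts(1, 2, 3, 4, 5, 6);
         Table t = new Table(data);
         // Explicit partitioning: row i is sent to the partition given by map[i].
         ColumnVector map = ColumnVector.fromInts(0, 2, 0, 2, 1, 1);
         PartitionedTable byMap = t.partition(map, 3);
         // Hash partitioning, now with an explicit choice of hash function.
         PartitionedTable byHash = t.onColumns(0).hashPartition(HashType.MURMUR3, 3)) {
      // getPartitions() reports the starting row offset of each partition in the result.
      System.out.println(java.util.Arrays.toString(byMap.getPartitions()));
      System.out.println(java.util.Arrays.toString(byHash.getPartitions()));
    }
  }
}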
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index c1239fe69ea..ceafc75f840 100755
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -17,10 +17,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 
 # Use GPU_ARCHS if it is defined
 if(DEFINED GPU_ARCHS)
-  unset(CMAKE_CUDA_ARCHITECTURES CACHE)
-  if(NOT "${GPU_ARCHS}" STREQUAL "ALL")
-    set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}")
-  endif()
+  set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}")
 endif()
 
 # If `CMAKE_CUDA_ARCHITECTURES` is not defined, build for all supported architectures. If
@@ -29,11 +26,10 @@ endif()
 
 # This needs to be run before enabling the CUDA language due to the default initialization behavior
 # of `CMAKE_CUDA_ARCHITECTURES`, https://gitlab.kitware.com/cmake/cmake/-/issues/21302
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  set(CUDF_BUILD_FOR_ALL_ARCHS TRUE)
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL")
+  set(CUDF_JNI_BUILD_FOR_ALL_ARCHS TRUE)
 elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "")
-  unset(CMAKE_CUDA_ARCHITECTURES CACHE)
-  set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE)
+  set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE)
 endif()
 
 project(CUDF_JNI VERSION 0.19 LANGUAGES C CXX)
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index e8474bda1be..ac14e1605d7 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -37,7 +37,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -60,6 +59,7 @@
 #include 
 #include 
 #include 
+#include "cudf/types.hpp"
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"
@@ -1026,6 +1026,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, j
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_fixedPointOutputScale(JNIEnv *env, jclass,
+                                                                            jint int_op,
+                                                                            jint lhs_scale,
+                                                                            jint rhs_scale) {
+  try {
+    // only the scale needs to be computed here; the output type id matches the lhs input
+    return cudf::binary_operation_fixed_point_scale(static_cast<cudf::binary_operator>(int_op),
+                                                    lhs_scale, rhs_scale);
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(JNIEnv *env, jclass,
                                                                   jlong lhs_view, jlong rhs_ptr,
                                                                   jint int_op, jint out_dtype,
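The fixedPointOutputScale JNI hook above is what the reworked BinaryOperable.implicitConversion calls so that decimal results carry a libcudf-computed scale instead of a hard-coded scale of 0. A hedged Java sketch of the effect; ColumnVector.decimalFromLongs and the chosen scales are assumptions used only for illustration:

import ai.rapids.cudf.ColumnVector;

public class DecimalScaleSketch {
  public static void main(String[] args) {
    // Two DECIMAL64 columns whose unscaled long values are interpreted with scale -2.
    try (ColumnVector a = ColumnVector.decimalFromLongs(-2, 12345L, 500L);
         ColumnVector b = ColumnVector.decimalFromLongs(-2, 100L, 250L);
         // add() now resolves its output DType via getFixedPointOutputScale(BinaryOp.ADD, ...).
         ColumnVector sum = a.add(b);
         // mul() resolves a different scale for the product, again computed by libcudf.
         ColumnVector product = a.mul(b)) {
      System.out.println("sum scale = " + sum.getType().getScale());
      System.out.println("product scale = " + product.getType().getScale());
    }
  }
}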
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index e051f68be4e..81b9882104f 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1613,9 +1614,43 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env,
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jclass,
+                                                                 jlong input_table,
+                                                                 jlong partition_column,
+                                                                 jint number_of_partitions,
+                                                                 jintArray output_offsets) {
+
+  JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
+  JNI_NULL_CHECK(env, partition_column, "partition_column is null", NULL);
+  JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
+  JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions is zero", NULL);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::column_view *n_part_column = reinterpret_cast<cudf::column_view *>(partition_column);
+    cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
+
+    auto result = cudf::partition(*n_input_table,
+                                  *n_part_column,
+                                  number_of_partitions);
+
+    for (size_t i = 0; i < result.second.size() - 1; i++) {
+      // partition() appends the total row count as a final offset, while hash
+      // partition/round robin do not, so skip the last entry for consistency
+      n_output_offsets[i] = result.second[i];
+    }
+
+    return cudf::jni::convert_table_for_return(env, result.first);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass,
                                                                      jlong input_table,
                                                                      jintArray columns_to_hash,
+                                                                     jint hash_function,
                                                                      jint number_of_partitions,
                                                                      jintArray output_offsets) {
 
@@ -1626,6 +1661,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env
 
   try {
     cudf::jni::auto_set_device(env);
+    cudf::hash_id hash_func = static_cast<cudf::hash_id>(hash_function);
     cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
     cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash);
     cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
@@ -1638,7 +1674,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env
     }
 
     std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> result =
-        cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions);
+        cudf::hash_partition(*n_input_table,
+                             columns_to_hash_vec,
+                             number_of_partitions,
+                             hash_func);
 
     for (size_t i = 0; i < result.second.size(); i++) {
       n_output_offsets[i] = result.second[i];
@@ -2046,4 +2085,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass,
+                                                                    jlong input_jtable,
+                                                                    jint column_index) {
+  JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *input_table = reinterpret_cast<cudf::table_view *>(input_jtable);
+    cudf::size_type col_index = static_cast<cudf::size_type>(column_index);
+    std::unique_ptr<cudf::table> exploded = cudf::explode_outer(*input_table, col_index);
+    return cudf::jni::convert_table_for_return(env, exploded);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass,
+                                                                            jlong input_jtable,
+                                                                            jint column_index) {
+  JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *input_table = reinterpret_cast<cudf::table_view *>(input_jtable);
+    cudf::size_type col_index = static_cast<cudf::size_type>(column_index);
+    std::unique_ptr<cudf::table> exploded = cudf::explode_outer_position(*input_table, col_index);
+    return cudf::jni::convert_table_for_return(env, exploded);
+  }
+  CATCH_STD(env, 0);
+}
+
 } // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 88196a4112a..c075f074068 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1742,7 +1742,7 @@ void testPartStability() {
     final int PARTS = 5;
     int expectedPart = -1;
     try (Table start = new Table.TestBuilder().column(0).build();
-         PartitionedTable out = start.onColumns(0).partition(PARTS)) {
+         PartitionedTable out = start.onColumns(0).hashPartition(PARTS)) {
       // Lets figure out what partitions this is a part of.
       int[] parts = out.getPartitions();
       for (int i = 0; i < parts.length; i++) {
@@ -1755,7 +1755,7 @@ void testPartStability() {
     for (int numEntries = 1; numEntries < COUNT; numEntries++) {
       try (ColumnVector data = ColumnVector.build(DType.INT32, numEntries, Range.appendInts(0, numEntries));
            Table t = new Table(data);
-           PartitionedTable out = t.onColumns(0).partition(PARTS);
+           PartitionedTable out = t.onColumns(0).hashPartition(PARTS);
            HostColumnVector tmp = out.getColumn(0).copyToHost()) {
         // Now we need to get the range out for the partition we expect
         int[] parts = out.getPartitions();
@@ -1775,6 +1775,23 @@ void testPartStability() {
 
   @Test
   void testPartition() {
+    try (Table t = new Table.TestBuilder()
+        .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+        .build();
+         ColumnVector parts = ColumnVector
+             .fromInts(1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+         PartitionedTable pt = t.partition(parts, 3);
+         Table expected = new Table.TestBuilder()
+             .column(1, 3, 5, 7, 9, 2, 4, 6, 8, 10)
+             .build()) {
+      int[] partCutoffs = pt.getPartitions();
+      assertArrayEquals(new int[]{0, 0, 5}, partCutoffs);
+      assertTablesAreEqual(expected, pt.getTable());
+    }
+  }
+
+  @Test
+  void testIdentityHashPartition() {
     final int count = 1024 * 1024;
     try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count));
          ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> {
@@ -1793,7 +1810,57 @@ void testPartition() {
         expected.add(i);
       }
       try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn});
-           PartitionedTable output = input.onColumns(0).partition(5)) {
+           PartitionedTable output = input.onColumns(0).hashPartition(HashType.IDENTITY, 5)) {
+        int[] parts = output.getPartitions();
+        assertEquals(5, parts.length);
+        assertEquals(0, parts[0]);
+        int previous = 0;
+        long rows = 0;
+        for (int i = 1; i < parts.length; i++) {
+          assertTrue(parts[i] >= previous);
+          rows += parts[i] - previous;
+          previous = parts[i];
+        }
+        assertTrue(rows <= count);
+        try (HostColumnVector aOut = output.getColumn(0).copyToHost();
+             HostColumnVector bOut = output.getColumn(1).copyToHost();
+             HostColumnVector cOut = output.getColumn(2).copyToHost()) {
+
+          for (int i = 0; i < count; i++) {
+            long fromA = aOut.getLong(i);
+            long fromB = bOut.getInt(i);
+            String fromC = cOut.getJavaString(i);
+            assertTrue(expected.remove(fromA));
+            assertEquals(fromA / 2, fromB);
+            assertEquals(String.valueOf(fromA), fromC, "At Index " + i);
+          }
+          assertTrue(expected.isEmpty());
+        }
+      }
+    }
+  }
+
+  @Test
+  void testHashPartition() {
+    final int count = 1024 * 1024;
+    try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count));
+         ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> {
+           for (int i = 0; i < count; i++) {
+             b.append(i / 2);
+           }
+         });
+         ColumnVector cIn = ColumnVector.build(DType.STRING, count, (b) -> {
+           for (int i = 0; i < count; i++) {
+             b.appendUTF8String(String.valueOf(i).getBytes());
+           }
+         })) {
+
+      HashSet<Long> expected = new HashSet<>();
+      for (long i = 0; i < count; i++) {
+        expected.add(i);
+      }
+      try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn});
+           PartitionedTable output = input.onColumns(0).hashPartition(5)) {
         int[] parts = output.getPartitions();
         assertEquals(5, parts.length);
         assertEquals(0, parts[0]);
@@ -4585,7 +4652,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out
     }
   }
 
-  private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) {
+  private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) {
     StructType nestedType = new StructType(true,
         new BasicType(false, DType.INT32), new BasicType(false, DType.STRING));
     try (Table input = new Table.TestBuilder()
@@ -4594,23 +4661,42 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) {
             Arrays.asList(struct(4, "k4"), struct(5, "k5")),
             Arrays.asList(struct(6, "k6")),
             Arrays.asList(new HostColumnVector.StructData((List) null)),
-            Arrays.asList())
+            null)
         .column("s1", "s2", "s3", "s4", "s5")
         .column(1, 3, 5, 7, 9)
         .column(12.0, 14.0, 13.0, 11.0, 15.0)
         .build()) {
       Table.TestBuilder expectedBuilder = new Table.TestBuilder();
       if (pos) {
-        expectedBuilder.column(0, 1, 2, 0, 1, 0, 0);
+        if (!outer)
+          expectedBuilder.column(0, 1, 2, 0, 1, 0, 0);
+        else
+          expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0);
       }
-      try (Table expected = expectedBuilder
-          .column(nestedType,
+      List<Object[]> expectedData = new ArrayList<Object[]>(){{
+        if (!outer) {
+          this.add(new HostColumnVector.StructData[]{
               struct(1, "k1"), struct(2, "k2"), struct(3, "k3"),
               struct(4, "k4"), struct(5, "k5"), struct(6, "k6"),
-              new HostColumnVector.StructData((List) null))
-          .column("s1", "s1", "s1", "s2", "s2", "s3", "s4")
-          .column(1, 1, 1, 3, 3, 5, 7)
-          .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0)
+              new HostColumnVector.StructData((List) null)});
+          this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4"});
+          this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7});
+          this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0});
+        } else {
+          this.add(new HostColumnVector.StructData[]{
+              struct(1, "k1"), struct(2, "k2"), struct(3, "k3"),
+              struct(4, "k4"), struct(5, "k5"), struct(6, "k6"),
+              new HostColumnVector.StructData((List) null), null});
+          this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4", "s5"});
+          this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7, 9});
+          this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0, 15.0});
+        }
+      }};
+      try (Table expected = expectedBuilder
+          .column(nestedType, (HostColumnVector.StructData[]) expectedData.get(0))
+          .column((String[]) expectedData.get(1))
+          .column((Integer[]) expectedData.get(2))
+          .column((Double[]) expectedData.get(3))
           .build()) {
         return new Table[]{new Table(input.getColumns()), new Table(expected.getColumns())};
       }
@@ -4629,7 +4715,7 @@ void testExplode() {
     }
 
     // Child is nested type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false);
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, false);
     try (Table input = testTables2[0];
          Table expected = testTables2[1]) {
       try (Table exploded = input.explode(0)) {
@@ -4639,7 +4725,7 @@ void testExplode() {
   }
 
   @Test
-  void testPosExplode() {
+  void testExplodePosition() {
     // Child is primitive type
     Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, false);
     try (Table input = testTables[0];
@@ -4649,8 +4735,8 @@ void testPosExplode() {
       }
     }
 
-    // Child is primitive type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true);
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, false);
     try (Table input = testTables2[0];
          Table expected = testTables2[1]) {
       try (Table exploded = input.explodePosition(0)) {
@@ -4659,4 +4745,45 @@ void testPosExplode() {
     }
   }
 
+  @Test
+  void testExplodeOuter() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(false, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
+
+  @Test
+  void testExplodeOuterPosition() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
 }
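Tying back to the JCudfSerialization change earlier in this patch: concatToContiguousTable() is now a thin wrapper over concatToHostBuffer() plus HostConcatResult.toContiguousTable(), so callers can keep the concatenated result in host memory until a device copy is actually needed. A hedged sketch of that usage; the helper name and the choice of when to move data to the device are illustrative assumptions:

import java.io.IOException;

import ai.rapids.cudf.ContiguousTable;
import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.JCudfSerialization;
import ai.rapids.cudf.JCudfSerialization.HostConcatResult;
import ai.rapids.cudf.JCudfSerialization.SerializedTableHeader;

public class HostConcatSketch {
  // Concatenate serialized table chunks on the host, copying to the GPU only when needed.
  static long concatAndCountRows(SerializedTableHeader[] headers,
                                 HostMemoryBuffer[] buffers) throws IOException {
    try (HostConcatResult hostResult = JCudfSerialization.concatToHostBuffer(headers, buffers);
         ContiguousTable deviceTable = hostResult.toContiguousTable()) {
      // The header and host buffer stay accessible, e.g. for spilling or size accounting.
      System.out.println("host-side bytes: " + hostResult.getHostBuffer().getLength());
      return deviceTable.getTable().getRowCount();
    }
  }
}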
diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 59a6b876961..5eaec640b15 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -93,6 +93,9 @@ class BinaryOperation(IntEnum):
     GENERIC_BINARY = (
          binary_operator.GENERIC_BINARY
     )
+    NULL_EQUALS = (
+         binary_operator.NULL_EQUALS
+    )
 
 
 cdef binaryop_v_v(Column lhs, Column rhs,
@@ -154,17 +157,6 @@ cdef binaryop_s_v(DeviceScalar lhs, Column rhs,
     return Column.from_unique_ptr(move(c_result))
 
 
-def handle_null_for_string_column(Column input_col, op):
-    if op in ('eq', 'lt', 'le', 'gt', 'ge'):
-        return replace_nulls(input_col, DeviceScalar(False, 'bool'))
-
-    elif op == 'ne':
-        return replace_nulls(input_col, DeviceScalar(True, 'bool'))
-
-    # Nothing needs to be done
-    return input_col
-
-
 def binaryop(lhs, rhs, op, dtype):
     """
     Dispatches a binary op call to the appropriate libcudf function:
@@ -205,11 +197,7 @@ def binaryop(lhs, rhs, op, dtype):
             c_op,
             c_dtype
         )
-
-    if is_string_col is True:
-        return handle_null_for_string_column(result, op.name.lower())
-    else:
-        return result
+    return result
 
 
 def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index ad798a73ed2..e5501428624 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -3,7 +3,7 @@
 import pandas as pd
 
 from libcpp cimport bool
-from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared
 from libcpp.vector cimport vector
 from libcpp.utility cimport move
 from libc.stdint cimport int32_t, int64_t
@@ -24,6 +24,10 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport size_type
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+from cudf._lib.cpp.lists.gather cimport (
+    segmented_gather as cpp_segmented_gather
+)
 cimport cudf._lib.cpp.copying as cpp_copying
 
 # workaround for https://github.com/cython/cython/issues/3885
@@ -704,3 +708,22 @@ def sample(Table input, size_type n,
             else input._index_names
         )
     )
+
+
+def segmented_gather(Column source_column, Column gather_map):
+    cdef shared_ptr[lists_column_view] source_LCV = (
+        make_shared[lists_column_view](source_column.view())
+    )
+    cdef shared_ptr[lists_column_view] gather_map_LCV = (
+        make_shared[lists_column_view](gather_map.view())
+    )
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_segmented_gather(
+                source_LCV.get()[0], gather_map_LCV.get()[0])
+        )
+
+    result = Column.from_unique_ptr(move(c_result))
+    return result
diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/cpp/binaryop.pxd
index fb36fdfd639..2e36070a164 100644
--- a/python/cudf/cudf/_lib/cpp/binaryop.pxd
+++ b/python/cudf/cudf/_lib/cpp/binaryop.pxd
@@ -27,6 +27,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
         GREATER "cudf::binary_operator::GREATER"
         LESS_EQUAL "cudf::binary_operator::LESS_EQUAL"
         GREATER_EQUAL "cudf::binary_operator::GREATER_EQUAL"
+        NULL_EQUALS "cudf::binary_operator::NULL_EQUALS"
         BITWISE_AND "cudf::binary_operator::BITWISE_AND"
         BITWISE_OR "cudf::binary_operator::BITWISE_OR"
         BITWISE_XOR "cudf::binary_operator::BITWISE_XOR"
diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
new file mode 100644
index 00000000000..ea664eee82e
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+
+
+cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] segmented_gather(
+        const lists_column_view source_column,
+        const lists_column_view gather_map_list
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
index ad675027c10..934269c6f25 100644
--- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -33,11 +33,3 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \
         string_character_types types_to_remove,
         string_scalar replacement,
         string_character_types types_to_keep) except +
-
-    cdef unique_ptr[column] is_integer(
-        column_view source_strings
-    ) except +
-
-    cdef unique_ptr[column] is_float(
-        column_view source_strings
-    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
index baee01b8f99..55a84b60efd 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -14,3 +14,7 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
 
     cdef unique_ptr[column] from_floats(
         column_view input_col) except +
+
+    cdef unique_ptr[column] is_float(
+        column_view source_strings
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
index 92f99a2f5cb..6e45d4ba869 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
@@ -15,6 +15,10 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
     cdef unique_ptr[column] from_integers(
         column_view input_col) except +
 
+    cdef unique_ptr[column] is_integer(
+        column_view source_strings
+    ) except +
+
     cdef unique_ptr[column] hex_to_integers(
         column_view input_col,
         data_type output_type) except +
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 7b455dd574b..2185cb089a7 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -57,6 +57,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
             return incol.dtype.type(0)
         if reduction_op == 'product':
             return incol.dtype.type(1)
+        if reduction_op == "any":
+            return False
 
         return cudf.utils.dtypes._get_nan_for_dtype(col_dtype)
 
diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
index 5d8d1522418..1890e98f956 100644
--- a/python/cudf/cudf/_lib/strings/char_types.pyx
+++ b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2021, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -14,8 +14,6 @@ from cudf._lib.cpp.strings.char_types cimport (
     all_characters_of_type as cpp_all_characters_of_type,
     filter_characters_of_type as cpp_filter_characters_of_type,
     string_character_types as string_character_types,
-    is_integer as cpp_is_integer,
-    is_float as cpp_is_float,
 )
 
 
@@ -191,35 +189,3 @@ def is_space(Column source_strings):
         ))
 
     return Column.from_unique_ptr(move(c_result))
-
-
-def is_integer(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have intergers.
-    """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_integer(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-def is_float(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have floats.
-    """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_float(
-            source_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
new file mode 100644
index 00000000000..195d9b71f6e
--- /dev/null
+++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
@@ -0,0 +1,29 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+
+from cudf._lib.cpp.strings.convert.convert_floats cimport (
+    is_float as cpp_is_float,
+)
+
+
+def is_float(Column source_strings):
+    """
+    Returns a Column of boolean values with True for `source_strings`
+    that have floats.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    with nogil:
+        c_result = move(cpp_is_float(
+            source_view
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
new file mode 100644
index 00000000000..d1bae1edd37
--- /dev/null
+++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
@@ -0,0 +1,29 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+
+from cudf._lib.cpp.strings.convert.convert_integers cimport (
+    is_integer as cpp_is_integer,
+)
+
+
+def is_integer(Column source_strings):
+    """
+    Returns a Column of boolean values with True for `source_strings`
+    that have integers.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+
+    with nogil:
+        c_result = move(cpp_is_integer(
+            source_view
+        ))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index c41a458f02b..39c278d2abf 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1014,7 +1014,11 @@ def slice(
     def binary_operator(
         self, op: str, rhs, reflect: bool = False
     ) -> ColumnBase:
-        if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"):
+        if not (self.ordered and rhs.ordered) and op not in (
+            "eq",
+            "ne",
+            "NULL_EQUALS",
+        ):
             if op in ("lt", "gt", "le", "ge"):
                 raise TypeError(
                     "Unordered Categoricals can only compare equality or not"
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 2bb35c97d7c..b2b2874eeb4 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -179,7 +179,11 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
         if check_dtypes:
             if self.dtype != other.dtype:
                 return False
-        return (self == other).min()
+        null_equals = self._null_equals(other)
+        return null_equals.all()
+
+    def _null_equals(self, other: ColumnBase) -> ColumnBase:
+        return self.binary_operator("NULL_EQUALS", other)
 
     def all(self) -> bool:
         return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_))
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 7c5385b9bbf..a563248f4ab 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -274,7 +274,7 @@ def binary_operator(
         if isinstance(rhs, cudf.DateOffset):
             return binop_offset(self, rhs, op)
         lhs, rhs = self, rhs
-        if op in ("eq", "ne", "lt", "gt", "le", "ge"):
+        if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"):
             out_dtype = np.dtype(np.bool_)  # type: Dtype
         elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
             out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype(
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index a60fe627acb..1d3f73822a9 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -2,14 +2,16 @@
 
 import pickle
 
+import numpy as np
 import pyarrow as pa
 
 import cudf
+from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import count_elements
 from cudf.core.buffer import Buffer
-from cudf.core.column import ColumnBase, column
+from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
-from cudf.utils.dtypes import is_list_dtype
+from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype
 
 
 class ListColumn(ColumnBase):
@@ -228,3 +230,58 @@ def len(self):
         dtype: int32
         """
         return self._return_or_inplace(count_elements(self._column))
+
+    def take(self, lists_indices):
+        """
+        Collect list elements based on given indices.
+
+        Parameters
+        ----------
+        lists_indices: List type arrays
+            Specifies what to collect from each row
+
+        Returns
+        -------
+        ListColumn
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1         None
+        2       [4, 5]
+        dtype: list
+        >>> s.list.take([[0, 1], [], []])
+        0    [1, 2]
+        1      None
+        2        []
+        dtype: list
+        """
+
+        lists_indices_col = as_column(lists_indices)
+        if not isinstance(lists_indices_col, ListColumn):
+            raise ValueError("lists_indices should be list type array.")
+        if not lists_indices_col.size == self._column.size:
+            raise ValueError(
+                "lists_indices and list column is of different " "size."
+            )
+        if not is_numerical_dtype(
+            lists_indices_col.children[1].dtype
+        ) or not np.issubdtype(
+            lists_indices_col.children[1].dtype, np.integer
+        ):
+            raise TypeError(
+                "lists_indices should be column of values of index types."
+            )
+
+        try:
+            res = self._return_or_inplace(
+                segmented_gather(self._column, lists_indices_col)
+            )
+        except RuntimeError as e:
+            if "contains nulls" in str(e):
+                raise ValueError("lists_indices contains null.") from e
+            raise
+        else:
+            return res
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 6fae8c644e3..7ad6eed65a8 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -700,16 +700,21 @@ def _numeric_column_binop(
     if reflect:
         lhs, rhs = rhs, lhs
 
-    is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"]
+    is_op_comparison = op in [
+        "lt",
+        "gt",
+        "le",
+        "ge",
+        "eq",
+        "ne",
+        "NULL_EQUALS",
+    ]
 
     if is_op_comparison:
         out_dtype = "bool"
 
     out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
-    if is_op_comparison:
-        out = out.fillna(op == "ne")
-
     return out
 
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 81abdd3f66a..11dd7556812 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -70,13 +70,15 @@
     is_alpha as cpp_is_alpha,
     is_decimal as cpp_is_decimal,
     is_digit as cpp_is_digit,
-    is_float as cpp_is_float,
-    is_integer as cpp_is_integer,
     is_lower as cpp_is_lower,
     is_numeric as cpp_is_numeric,
     is_space as cpp_isspace,
     is_upper as cpp_is_upper,
 )
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
+from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float
 from cudf._lib.strings.combine import (
     concatenate as cpp_concatenate,
     join as cpp_join,
@@ -434,7 +436,6 @@ def cat(self, others=None, sep=None, na_rep=None):
         3    dD
         dtype: object
         """
-
         if sep is None:
             sep = ""
 
@@ -5109,7 +5110,7 @@ def binary_operator(
         if isinstance(rhs, (StringColumn, str, cudf.Scalar)):
             if op == "add":
                 return cast("column.ColumnBase", lhs.str().cat(others=rhs))
-            elif op in ("eq", "ne", "gt", "lt", "ge", "le"):
+            elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"):
                 return _string_column_binop(self, rhs, op=op, out_dtype="bool")
 
         raise TypeError(
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index ac63192b692..e22b511db01 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -223,7 +223,7 @@ def binary_operator(
 
         if op in ("eq", "ne"):
             out_dtype = self._binary_op_eq_ne(rhs)
-        elif op in ("lt", "gt", "le", "ge"):
+        elif op in ("lt", "gt", "le", "ge", "NULL_EQUALS"):
             out_dtype = self._binary_op_lt_gt_le_ge(rhs)
         elif op == "mul":
             out_dtype = self._binary_op_mul(rhs)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index ecdce9443a1..9672ab3002f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1518,11 +1518,7 @@ def fallback(col, fn):
                 else:
                     if col not in df_cols:
                         r_opr = other_cols[col]
-                        l_opr = Series(
-                            column_empty(
-                                len(self), masked=True, dtype=other.dtype
-                            )
-                        )
+                        l_opr = Series(as_column(np.nan, length=len(self)))
                     if col not in other_cols_keys:
                         r_opr = None
                         l_opr = self[col]
@@ -2198,7 +2194,7 @@ def rpow(self, other, axis="columns", level=None, fill_value=None):
         return self._apply_op("rpow", other, fill_value)
 
     def __rpow__(self, other):
-        return self._apply_op("__pow__", other)
+        return self._apply_op("__rpow__", other)
 
     def floordiv(self, other, axis="columns", level=None, fill_value=None):
         """
@@ -6031,7 +6027,6 @@ def isin(self, values):
         falcon      True       True
         dog        False      False
         """
-
         if isinstance(values, dict):
 
             result_df = DataFrame()
@@ -6051,14 +6046,15 @@ def isin(self, values):
             values = values.reindex(self.index)
 
             result = DataFrame()
-
+            # TODO: propagate nulls through isin
+            # https://github.com/rapidsai/cudf/issues/7556
             for col in self._data.names:
                 if isinstance(
                     self[col]._column, cudf.core.column.CategoricalColumn
                 ) and isinstance(
                     values._column, cudf.core.column.CategoricalColumn
                 ):
-                    res = self._data[col] == values._column
+                    res = (self._data[col] == values._column).fillna(False)
                     result[col] = res
                 elif (
                     isinstance(
@@ -6073,7 +6069,9 @@ def isin(self, values):
                 ):
                     result[col] = utils.scalar_broadcast_to(False, len(self))
                 else:
-                    result[col] = self._data[col] == values._column
+                    result[col] = (self._data[col] == values._column).fillna(
+                        False
+                    )
 
             result.index = self.index
             return result
@@ -6083,7 +6081,9 @@ def isin(self, values):
             result = DataFrame()
             for col in self._data.names:
                 if col in values.columns:
-                    result[col] = self._data[col] == values[col]._column
+                    result[col] = (
+                        self._data[col] == values[col]._column
+                    ).fillna(False)
                 else:
                     result[col] = utils.scalar_broadcast_to(False, len(self))
             result.index = self.index
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 275d085ef5d..fab5936f94d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1566,10 +1566,7 @@ def _apply_boolean_mask(self, boolean_mask):
         rows corresponding to `False` is dropped
         """
         boolean_mask = as_column(boolean_mask)
-        if boolean_mask.has_nulls:
-            raise ValueError(
-                "cannot mask with boolean_mask containing null values"
-            )
+
         result = self.__class__._from_table(
             libcudf.stream_compaction.apply_boolean_mask(
                 self, as_column(boolean_mask)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 11e32e2285d..b06fef178f6 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1501,9 +1501,7 @@ def _binaryop(
         If ``reflect`` is ``True``, swap the order of the operands.
         """
         if isinstance(other, cudf.DataFrame):
-            # TODO: fn is not the same as arg expected by _apply_op
-            # e.g. for fn = 'and', _apply_op equivalent is '__and__'
-            return other._apply_op(self, fn)
+            return NotImplemented
 
         result_name = utils.get_result_name(self, other)
         if isinstance(other, Series):
@@ -3120,8 +3118,10 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
                 "bool_only parameter is not implemented yet"
             )
 
-        if self.empty:
-            return False
+        skipna = False if skipna is None else skipna
+
+        if skipna is False and self.has_nulls:
+            return True
 
         if skipna:
             result_series = self.nans_to_nulls()
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 4e5e4ce1987..535e497e8dc 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -8,7 +8,9 @@
 from pandas.core.tools.datetimes import _unit_map
 
 import cudf
-from cudf._lib.strings.char_types import is_integer as cpp_is_integer
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
 from cudf.core import column
 from cudf.core.index import as_index
 from cudf.utils.dtypes import is_scalar
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index a0b65743180..18f2d7e474b 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -206,12 +206,45 @@ def test_series_compare(cmpop, obj_class, dtype):
     np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2))
 
 
+def _series_compare_nulls_typegen():
+    tests = []
+    tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
+    tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
+    tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
+    tests += list(product(STRING_TYPES, STRING_TYPES))
+
+    return tests
+
+
+@pytest.mark.parametrize("cmpop", _cmpops)
+@pytest.mark.parametrize("dtypes", _series_compare_nulls_typegen())
+def test_series_compare_nulls(cmpop, dtypes):
+    ltype, rtype = dtypes
+
+    ldata = [1, 2, None, None, 5]
+    rdata = [2, 1, None, 4, None]
+
+    lser = Series(ldata, dtype=ltype)
+    rser = Series(rdata, dtype=rtype)
+
+    lmask = ~lser.isnull()
+    rmask = ~rser.isnull()
+
+    expect_mask = np.logical_and(lmask, rmask)
+    expect = cudf.Series([None] * 5, dtype="bool")
+    expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask])
+
+    got = cmpop(lser, rser)
+    utils.assert_eq(expect, got)
+
+
 @pytest.mark.parametrize(
-    "obj", [pd.Series(["a", "b", None, "d", "e", None]), "a"]
+    "obj", [pd.Series(["a", "b", None, "d", "e", None], dtype="string"), "a"]
 )
 @pytest.mark.parametrize("cmpop", _cmpops)
 @pytest.mark.parametrize(
-    "cmp_obj", [pd.Series(["b", "a", None, "d", "f", None]), "a"]
+    "cmp_obj",
+    [pd.Series(["b", "a", None, "d", "f", None], dtype="string"), "a"],
 )
 def test_string_series_compare(obj, cmpop, cmp_obj):
 
@@ -221,10 +254,12 @@ def test_string_series_compare(obj, cmpop, cmp_obj):
     g_cmp_obj = cmp_obj
     if isinstance(g_cmp_obj, pd.Series):
         g_cmp_obj = Series.from_pandas(g_cmp_obj)
-
     got = cmpop(g_obj, g_cmp_obj)
     expected = cmpop(obj, cmp_obj)
 
+    if isinstance(expected, pd.Series):
+        expected = cudf.from_pandas(expected)
+
     utils.assert_eq(expected, got)
 
 
@@ -694,10 +729,12 @@ def test_operator_func_series_and_scalar(
 def test_operator_func_between_series_logical(
     dtype, func, scalar_a, scalar_b, fill_value
 ):
-    gdf_series_a = Series([scalar_a]).astype(dtype)
-    gdf_series_b = Series([scalar_b]).astype(dtype)
-    pdf_series_a = gdf_series_a.to_pandas()
-    pdf_series_b = gdf_series_b.to_pandas()
+
+    gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype)
+    gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype)
+
+    pdf_series_a = gdf_series_a.to_pandas(nullable=True)
+    pdf_series_b = gdf_series_b.to_pandas(nullable=True)
 
     gdf_series_result = getattr(gdf_series_a, func)(
         gdf_series_b, fill_value=fill_value
@@ -705,16 +742,22 @@ def test_operator_func_between_series_logical(
     pdf_series_result = getattr(pdf_series_a, func)(
         pdf_series_b, fill_value=fill_value
     )
-
-    if scalar_a in [None, np.nan] and scalar_b in [None, np.nan]:
-        # cudf binary operations will return `None` when both left- and right-
-        # side values are `None`. It will return `np.nan` when either side is
-        # `np.nan`. As a consequence, when we convert our gdf => pdf during
-        # assert_eq, we get a pdf with dtype='object' (all inputs are none).
-        # to account for this, we use fillna.
-        gdf_series_result.fillna(func == "ne", inplace=True)
-
-    utils.assert_eq(pdf_series_result, gdf_series_result)
+    expect = pdf_series_result
+    got = gdf_series_result.to_pandas(nullable=True)
+
+    # If fill_value is np.nan, things break down a bit,
+    # because setting a NaN into a pandas nullable float
+    # array still gets transformed to <NA>. As such,
+    # pd_series_with_nulls.fillna(np.nan) has no effect.
+    if (
+        (pdf_series_a.isnull().sum() != pdf_series_b.isnull().sum())
+        and np.isscalar(fill_value)
+        and np.isnan(fill_value)
+    ):
+        with pytest.raises(AssertionError):
+            utils.assert_eq(expect, got)
+        return
+    utils.assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("dtype", ["float32", "float64"])
@@ -729,8 +772,7 @@ def test_operator_func_series_and_scalar_logical(
     gdf_series = utils.gen_rand_series(
         dtype, 1000, has_nulls=has_nulls, stride=10000
     )
-    pdf_series = gdf_series.to_pandas()
-
+    pdf_series = gdf_series.to_pandas(nullable=True)
     gdf_series_result = getattr(gdf_series, func)(
         cudf.Scalar(scalar) if use_cudf_scalar else scalar,
         fill_value=fill_value,
@@ -739,7 +781,10 @@ def test_operator_func_series_and_scalar_logical(
         scalar, fill_value=fill_value
     )
 
-    utils.assert_eq(pdf_series_result, gdf_series_result)
+    expect = pdf_series_result
+    got = gdf_series_result.to_pandas(nullable=True)
+
+    utils.assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("func", _operators_arithmetic)
@@ -1738,10 +1783,61 @@ def test_equality_ops_index_mismatch(fn):
         index=["aa", "b", "c", "d", "e", "f", "y", "z"],
     )
 
-    pa = a.to_pandas()
-    pb = b.to_pandas()
-
+    pa = a.to_pandas(nullable=True)
+    pb = b.to_pandas(nullable=True)
     expected = getattr(pa, fn)(pb)
-    actual = getattr(a, fn)(b)
+    actual = getattr(a, fn)(b).to_pandas(nullable=True)
 
     utils.assert_eq(expected, actual)
+
+
+def generate_test_null_equals_columnops_data():
+    # Generate tuples of:
+    # (left_data, right_data, compare_bool, null_case)
+    # where compare_bool indicates whether the columns
+    # should compare as equal under null-equals semantics
+
+    def set_null_cases(column_l, column_r, case):
+        if case == "neither":
+            return column_l, column_r
+        elif case == "left":
+            column_l[1] = None
+        elif case == "right":
+            column_r[1] = None
+        elif case == "both":
+            column_l[1] = None
+            column_r[1] = None
+        else:
+            raise ValueError("Unknown null case")
+        return column_l, column_r
+
+    null_cases = ["neither", "left", "right", "both"]
+    data = [1, 2, 3]
+
+    results = []
+    # TODO: Numeric types can be cross compared as null equal
+    for dtype in (
+        list(NUMERIC_TYPES)
+        + list(DATETIME_TYPES)
+        + list(TIMEDELTA_TYPES)
+        + list(STRING_TYPES)
+        + ["category"]
+    ):
+        for case in null_cases:
+            left = cudf.Series(data, dtype=dtype)
+            right = cudf.Series(data, dtype=dtype)
+            if case in {"left", "right"}:
+                answer = False
+            else:
+                answer = True
+            left, right = set_null_cases(left, right, case)
+            results.append((left._column, right._column, answer, case))
+
+    return results
+
+
+@pytest.mark.parametrize(
+    "lcol,rcol,ans,case", generate_test_null_equals_columnops_data()
+)
+def test_null_equals_columnops(lcol, rcol, ans, case):
+    assert lcol._null_equals(rcol).all() == ans
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index ffd66e18314..5f4d571e8c5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4996,13 +4996,13 @@ def test_cov_nans():
 @pytest.mark.parametrize(
     "gsr",
     [
-        cudf.Series([1, 2, 3]),
-        cudf.Series([1, 2, 3], index=["a", "b", "c"]),
-        cudf.Series([1, 2, 3], index=["a", "b", "d"]),
-        cudf.Series([1, 2], index=["a", "b"]),
-        cudf.Series([1, 2, 3], index=cudf.core.index.RangeIndex(0, 3)),
+        cudf.Series([4, 2, 3]),
+        cudf.Series([4, 2, 3], index=["a", "b", "c"]),
+        cudf.Series([4, 2, 3], index=["a", "b", "d"]),
+        cudf.Series([4, 2], index=["a", "b"]),
+        cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)),
         pytest.param(
-            cudf.Series([1, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]),
+            cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]),
             marks=pytest.mark.xfail,
         ),
     ],
@@ -5026,21 +5026,23 @@ def test_cov_nans():
     ],
 )
 def test_df_sr_binop(gsr, colnames, op):
-    data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]]
+    data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]]
     data = dict(zip(colnames, data))
 
+    gsr = gsr.astype("float64")
+
     gdf = cudf.DataFrame(data)
-    pdf = pd.DataFrame.from_dict(data)
+    pdf = gdf.to_pandas(nullable=True)
 
-    psr = gsr.to_pandas()
+    psr = gsr.to_pandas(nullable=True)
 
     expect = op(pdf, psr)
-    got = op(gdf, gsr)
-    assert_eq(expect.astype(float), got.astype(float))
+    got = op(gdf, gsr).to_pandas(nullable=True)
+    assert_eq(expect, got, check_dtype=False)
 
     expect = op(psr, pdf)
-    got = op(psr, pdf)
-    assert_eq(expect.astype(float), got.astype(float))
+    got = op(gsr, gdf).to_pandas(nullable=True)
+    assert_eq(expect, got, check_dtype=False)
 
 
 @pytest.mark.parametrize(
@@ -5052,12 +5054,14 @@ def test_df_sr_binop(gsr, colnames, op):
         operator.truediv,
         operator.mod,
         operator.pow,
-        operator.eq,
-        operator.lt,
-        operator.le,
-        operator.gt,
-        operator.ge,
-        operator.ne,
+        # comparison ops will temporarily XFAIL
+        # see PR https://github.com/rapidsai/cudf/pull/7491
+        pytest.param(operator.eq, marks=pytest.mark.xfail()),
+        pytest.param(operator.lt, marks=pytest.mark.xfail()),
+        pytest.param(operator.le, marks=pytest.mark.xfail()),
+        pytest.param(operator.gt, marks=pytest.mark.xfail()),
+        pytest.param(operator.ge, marks=pytest.mark.xfail()),
+        pytest.param(operator.ne, marks=pytest.mark.xfail()),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 558700f1f89..cec2623027f 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -755,17 +755,6 @@ def do_slice(x):
     assert_eq(expect, got, check_dtype=False)
 
 
-def test_dataframe_boolean_mask_with_None():
-    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
-    gdf = cudf.DataFrame.from_pandas(pdf)
-    pdf_masked = pdf[[True, False, True, False]]
-    gdf_masked = gdf[[True, False, True, False]]
-    assert_eq(pdf_masked, gdf_masked)
-
-    with pytest.raises(ValueError):
-        gdf[cudf.Series([True, False, None, False])]
-
-
 @pytest.mark.parametrize("dtype", [int, float, str])
 def test_empty_boolean_mask(dtype):
     gdf = cudf.datasets.randomdata(nrows=0, dtypes={"a": dtype})
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 195d8749ec6..33812cfa7a7 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -112,3 +112,50 @@ def test_len(data):
     got = gsr.list.len()
 
     assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("data", "idx"),
+    [
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]),
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]),
+        ([[1, 2, 3], []], [[0, 1], []]),
+        ([[1, 2, 3], [None]], [[0, 1], []]),
+        ([[1, None, 3], None], [[0, 1], []]),
+    ],
+)
+def test_take(data, idx):
+    ps = pd.Series(data)
+    gs = cudf.from_pandas(ps)
+
+    expected = pd.Series(zip(ps, idx)).map(
+        lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None
+    )
+    got = gs.list.take(idx)
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    ("invalid", "exception"),
+    [
+        ([[0]], pytest.raises(ValueError, match="different size")),
+        ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")),
+        (
+            [["a", "b"], ["c"]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        (
+            [[[1], [0]], [[0]]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        ([[0, 1], None], pytest.raises(ValueError, match="contains null")),
+    ],
+)
+def test_take_invalid(invalid, exception):
+    gs = cudf.Series([[0, 1], [2, 3]])
+    with exception:
+        gs.list.take(invalid)
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ed91e909f25..ca8aa00f80c 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -724,3 +724,17 @@ def test_orc_bool_encode_fail():
     # Also validate data
     pdf = pa.orc.ORCFile(buffer).read().to_pandas()
     assert_eq(okay_df, pdf)
+
+
+def test_nanoseconds_overflow():
+    buffer = BytesIO()
+    # Use nanosecond values that take more than 32 bits to encode
+    s = cudf.Series([710424008, -1338482640], dtype="datetime64[ns]")
+    expected = cudf.DataFrame({"s": s})
+    expected.to_orc(buffer)
+
+    cudf_got = cudf.read_orc(buffer)
+    assert_eq(expected, cudf_got)
+
+    pyarrow_got = pa.orc.ORCFile(buffer).read()
+    assert_eq(expected.to_pandas(), pyarrow_got.to_pandas())
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 4d2e2a4b33b..1005efec3ee 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -143,15 +143,14 @@ def test_setitem_dataframe_series_inplace(df):
 )
 def test_series_set_equal_length_object_by_mask(replace_data):
 
-    psr = pd.Series([1, 2, 3, 4, 5])
+    psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
     gsr = cudf.from_pandas(psr)
 
     # Lengths match in trivial case
-    pd_bool_col = pd.Series([True] * len(psr))
+    pd_bool_col = pd.Series([True] * len(psr), dtype="boolean")
     gd_bool_col = cudf.from_pandas(pd_bool_col)
-
     psr[pd_bool_col] = (
-        replace_data.to_pandas()
+        replace_data.to_pandas(nullable=True)
         if hasattr(replace_data, "to_pandas")
         else replace_data
     )