From 36f18c8974c178eae2f58b936c83738a35168521 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 15 Mar 2021 11:09:39 -0700 Subject: [PATCH 01/21] Fix ORC issue with incorrect timestamp nanosecond values (#7581) Closes #7355 Use 64 bit variables/buffers to handle nanosecond values since nanosecond encode can overflow a 32bit value in some cases. Removed the overloaded `intrle_minmax` function, using templated `numeric_limits` functions instead (the alternative was to add another overload). Performance impact evaluation pending, but this fix seems unavoidable regardless of the impact. Authors: - Vukasin Milovanovic (@vuule) Approvers: - GALI PREM SAGAR (@galipremsagar) - Devavret Makkar (@devavret) - Kumar Aatish (@kaatish) URL: https://github.com/rapidsai/cudf/pull/7581 --- cpp/src/io/orc/stripe_data.cu | 34 ++++++++++++++++++++-------- cpp/src/io/orc/stripe_enc.cu | 36 +++++++++--------------------- python/cudf/cudf/tests/test_orc.py | 14 ++++++++++++ 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 4bca725a16b..1ff752034ad 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1455,8 +1455,9 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); // Decode data streams { - uint32_t numvals = s->top.data.max_vals, secondary_val; - uint32_t vals_skipped = 0; + uint32_t numvals = s->top.data.max_vals; + uint64_t secondary_val = 0; + uint32_t vals_skipped = 0; if (s->is_string || s->chunk.type_kind == TIMESTAMP) { // For these data types, we have a secondary unsigned 32-bit data stream orc_bytestream_s *bs = (is_dictionary(s->chunk.encoding_kind)) ? &s->bs : &s->bs2; @@ -1471,9 +1472,15 @@ __global__ void __launch_bounds__(block_size) } if (numvals > ofs) { if (is_rlev1(s->chunk.encoding_kind)) { - numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t); + if (s->chunk.type_kind == TIMESTAMP) + numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u64[ofs], numvals - ofs, t); + else + numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t); } else { - numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t); + if (s->chunk.type_kind == TIMESTAMP) + numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u64[ofs], numvals - ofs, t); + else + numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t); } __syncthreads(); if (numvals <= ofs && t >= ofs && t < s->top.data.max_vals) { s->vals.u32[t] = 0; } @@ -1487,15 +1494,24 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); if (t == 0) { s->top.data.index.run_pos[cid] = 0; } numvals -= vals_skipped; - if (t < numvals) { secondary_val = s->vals.u32[vals_skipped + t]; } + if (t < numvals) { + secondary_val = (s->chunk.type_kind == TIMESTAMP) ? s->vals.u64[vals_skipped + t] + : s->vals.u32[vals_skipped + t]; + } __syncthreads(); - if (t < numvals) { s->vals.u32[t] = secondary_val; } + if (t < numvals) { + if (s->chunk.type_kind == TIMESTAMP) + s->vals.u64[t] = secondary_val; + else + s->vals.u32[t] = secondary_val; + } } } __syncthreads(); // For strings with direct encoding, we need to convert the lengths into an offset if (!is_dictionary(s->chunk.encoding_kind)) { - secondary_val = (t < numvals) ? s->vals.u32[t] : 0; + if (t < numvals) + secondary_val = (s->chunk.type_kind == TIMESTAMP) ? 
s->vals.u64[t] : s->vals.u32[t]; if (s->chunk.type_kind != TIMESTAMP) { lengths_to_positions(s->vals.u32, numvals, t); __syncthreads(); @@ -1693,7 +1709,7 @@ __global__ void __launch_bounds__(block_size) } case TIMESTAMP: { int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - uint32_t nanos = secondary_val; + uint64_t nanos = secondary_val; nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; if (!tz_table.ttimes.empty()) { seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); @@ -1716,7 +1732,7 @@ __global__ void __launch_bounds__(block_size) if (s->chunk.type_kind == TIMESTAMP) { int buffer_pos = s->top.data.max_vals; if (t >= buffer_pos && t < buffer_pos + s->top.data.buffered_count) { - s->vals.u32[t - buffer_pos] = secondary_val; + s->vals.u64[t - buffer_pos] = secondary_val; } } else if (s->chunk.type_kind == BOOLEAN && t < s->top.data.buffered_count) { s->vals.u8[t] = secondary_val; diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 88cad005817..aef32efaf6e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -92,6 +92,7 @@ struct orcenc_state_s { union { uint8_t u8[2048]; uint32_t u32[1024]; + uint64_t u64[1024]; } lengths; }; @@ -101,6 +102,7 @@ static inline __device__ uint32_t zigzag(int32_t v) int32_t s = (v >> 31); return ((v ^ s) * 2) - s; } +static inline __device__ uint64_t zigzag(uint64_t v) { return v; } static inline __device__ uint64_t zigzag(int64_t v) { int64_t s = (v < 0) ? 1 : 0; @@ -286,24 +288,6 @@ static inline __device__ uint32_t StoreVarint(uint8_t *dst, uint64_t v) return bytecnt; } -static inline __device__ void intrle_minmax(int64_t &vmin, int64_t &vmax) -{ - vmin = INT64_MIN; - vmax = INT64_MAX; -} -// static inline __device__ void intrle_minmax(uint64_t &vmin, uint64_t &vmax) { vmin = UINT64_C(0); -// vmax = UINT64_MAX; } -static inline __device__ void intrle_minmax(int32_t &vmin, int32_t &vmax) -{ - vmin = INT32_MIN; - vmax = INT32_MAX; -} -static inline __device__ void intrle_minmax(uint32_t &vmin, uint32_t &vmax) -{ - vmin = UINT32_C(0); - vmax = UINT32_MAX; -} - template static inline __device__ void StoreBytesBigEndian(uint8_t *dst, T v, uint32_t w) { @@ -412,13 +396,9 @@ static __device__ uint32_t IntegerRLE(orcenc_state_s *s, // Find minimum and maximum values if (literal_run > 0) { // Find min & max - T vmin, vmax; + T vmin = (t < literal_run) ? v0 : std::numeric_limits::max(); + T vmax = (t < literal_run) ? 
v0 : std::numeric_limits::min(); uint32_t literal_mode, literal_w; - if (t < literal_run) { - vmin = vmax = v0; - } else { - intrle_minmax(vmax, vmin); - } vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); __syncthreads(); vmax = block_reduce(temp_storage).Reduce(vmax, cub::Max()); @@ -652,6 +632,7 @@ __global__ void __launch_bounds__(block_size) typename cub::BlockReduce::TempStorage i32; typename cub::BlockReduce::TempStorage i64; typename cub::BlockReduce::TempStorage u32; + typename cub::BlockReduce::TempStorage u64; } temp_storage; orcenc_state_s *const s = &state_g; @@ -763,7 +744,7 @@ __global__ void __launch_bounds__(block_size) int64_t ts = static_cast(base)[row]; int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; - int32_t nanos = (ts - seconds * ts_scale); + int64_t nanos = (ts - seconds * ts_scale); // There is a bug in the ORC spec such that for negative timestamps, it is understood // between the writer and reader that nanos will be adjusted to their positive component // but the negative seconds will be left alone. This means that -2.6 is encoded as @@ -786,7 +767,7 @@ __global__ void __launch_bounds__(block_size) } nanos = (nanos << 3) + zeroes; } - s->lengths.u32[nz_idx] = nanos; + s->lengths.u64[nz_idx] = nanos; break; } case STRING: @@ -897,6 +878,9 @@ __global__ void __launch_bounds__(block_size) uint32_t flush = (s->cur_row == s->chunk.num_rows) ? 1 : 0, n; switch (s->chunk.type_kind) { case TIMESTAMP: + n = IntegerRLE( + s, s->lengths.u64, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u64); + break; case STRING: n = IntegerRLE( s, s->lengths.u32, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u32); diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index ed91e909f25..ca8aa00f80c 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -724,3 +724,17 @@ def test_orc_bool_encode_fail(): # Also validate data pdf = pa.orc.ORCFile(buffer).read().to_pandas() assert_eq(okay_df, pdf) + + +def test_nanoseconds_overflow(): + buffer = BytesIO() + # Use nanosecond values that take more than 32 bits to encode + s = cudf.Series([710424008, -1338482640], dtype="datetime64[ns]") + expected = cudf.DataFrame({"s": s}) + expected.to_orc(buffer) + + cudf_got = cudf.read_orc(buffer) + assert_eq(expected, cudf_got) + + pyarrow_got = pa.orc.ORCFile(buffer).read() + assert_eq(expected.to_pandas(), pyarrow_got.to_pandas()) From 05bb2f06ad05b4db1cde08e947797729e4a4b9dd Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Mon, 15 Mar 2021 18:56:30 -0500 Subject: [PATCH 02/21] Fix auto-detecting GPU architectures (#7593) Fixes regression from https://github.com/rapidsai/cudf/pull/7579 in auto-detecting GPU architectures when `-DCMAKE_CUDA_ARCHITECTURES=` is passed on the CLI. Now that the cached `CMAKE_CUDA_ARCHITECTURES` isn't unset before calling `enable_language(CUDA)`, this call throws an error and configuration fails. This change ensures we call `enable_language(CUDA)` after any potential rewrites of `CMAKE_CUDA_ARCHITECTURES`. This PR also aligns with RMM's `EvalGPUArchs.cmake` logic and prints `SUPPORTED_CUDA_ARCHITECTURES` instead of `"ALL"` in the case the current machine is a CPU-only node. 
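For reference, the architecture-detection program that EvalGPUArchs.cmake writes out and runs with nvcc (see the diff further down) boils down to roughly the following standalone sketch; the exact include set and the "70;75;80" fallback used here are assumptions for illustration, not taken from the build files.

#include <cstdio>
#include <set>
#include <string>
#include <cuda_runtime.h>

// Query every visible device and print the unique set of compute
// capabilities (e.g. "70;80"); print a fallback list when no GPU is found,
// which is what lets configuration proceed on CPU-only nodes.
int main() {
  int device_count = 0;
  std::set<std::string> archs;
  if (cudaGetDeviceCount(&device_count) == cudaSuccess) {
    for (int dev = 0; dev < device_count; ++dev) {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
      char buff[32];
      std::snprintf(buff, sizeof(buff), "%d%d", prop.major, prop.minor);
      archs.insert(buff);
    }
  }
  if (archs.empty()) {
    std::printf("70;75;80\n");  // assumed stand-in for SUPPORTED_CUDA_ARCHITECTURES
  } else {
    bool first = true;
    for (const auto& arch : archs) {
      std::printf(first ? "%s" : ";%s", arch.c_str());
      first = false;
    }
    std::printf("\n");
  }
  return 0;
}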
Related: https://github.com/rapidsai/rmm/pull/727 Authors: - Paul Taylor (@trxcllnt) - Robert Maynard (@robertmaynard) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7593 --- build.sh | 2 +- cpp/cmake/Modules/ConfigureCUDA.cmake | 22 ++++--------- cpp/cmake/Modules/EvalGPUArchs.cmake | 37 ++++++++++++--------- cpp/cmake/Modules/SetGPUArchs.cmake | 46 +++++++++++++++------------ 4 files changed, 55 insertions(+), 52 deletions(-) diff --git a/build.sh b/build.sh index 5eb404d02a8..d75053f8849 100755 --- a/build.sh +++ b/build.sh @@ -135,7 +135,7 @@ if hasArg clean; then fi if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then - CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=ALL" + CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=" echo "Building for the architecture of the GPU in the system..." else CUDF_CMAKE_CUDA_ARCHITECTURES="" diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index 44699a13206..d4be6e65021 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -17,26 +17,16 @@ # Find the CUDAToolkit find_package(CUDAToolkit REQUIRED) -# Must come after find_package(CUDAToolkit) because we symlink -# ccache as a compiler front-end for nvcc in gpuCI CPU builds. -enable_language(CUDA) - -if(CMAKE_CUDA_COMPILER_VERSION) - # Compute the version. from CMAKE_CUDA_COMPILER_VERSION - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${CMAKE_CUDA_COMPILER_VERSION}) - set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}") -endif() - -message(VERBOSE "CUDF: CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}") -message(VERBOSE "CUDF: CUDA_VERSION_MINOR: ${CUDA_VERSION_MINOR}") -message(STATUS "CUDF: CUDA_VERSION: ${CUDA_VERSION}") - # Auto-detect available GPU compute architectures - include(${CUDF_SOURCE_DIR}/cmake/Modules/SetGPUArchs.cmake) message(STATUS "CUDF: Building CUDF for GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}") +# Must come after find_package(CUDAToolkit) because we symlink +# ccache as a compiler front-end for nvcc in gpuCI CPU builds. +# Must also come after we detect and potentially rewrite +# CMAKE_CUDA_ARCHITECTURES +enable_language(CUDA) + if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) if(CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS) diff --git a/cpp/cmake/Modules/EvalGPUArchs.cmake b/cpp/cmake/Modules/EvalGPUArchs.cmake index 6c747a0b867..09e42c6cc7a 100644 --- a/cpp/cmake/Modules/EvalGPUArchs.cmake +++ b/cpp/cmake/Modules/EvalGPUArchs.cmake @@ -14,12 +14,21 @@ # limitations under the License. 
#============================================================================= +# Unset this first in case it's set to +set(CMAKE_CUDA_ARCHITECTURES OFF) + +# Enable CUDA so we can invoke nvcc +enable_language(CUDA) + +# Function uses the CUDA runtime API to query the compute capability of the device, so if a user +# doesn't pass any architecture options to CMake we only build the current architecture function(evaluate_gpu_archs gpu_archs) set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu) set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs) set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log) - file(WRITE ${eval_file} -[=[ + file( + WRITE ${eval_file} + " #include #include #include @@ -32,32 +41,30 @@ int main(int argc, char** argv) { char buff[32]; cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue; - sprintf(buff, "%d%d", prop.major, prop.minor); + sprintf(buff, \"%d%d\", prop.major, prop.minor); archs.insert(buff); } } if(archs.empty()) { - printf("ALL"); + printf(\"${SUPPORTED_CUDA_ARCHITECTURES}\"); } else { bool first = true; for(const auto& arch : archs) { - printf(first? "%s" : ";%s", arch.c_str()); + printf(first? \"%s\" : \";%s\", arch.c_str()); first = false; } } - printf("\n"); + printf(\"\\n\"); return 0; } -]=]) +") execute_process( - COMMAND ${CMAKE_CUDA_COMPILER} - -std=c++11 - -o ${eval_exe} - --run - ${eval_file} + COMMAND ${CMAKE_CUDA_COMPILER} -std=c++11 -o ${eval_exe} --run ${eval_file} OUTPUT_VARIABLE __gpu_archs OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_FILE ${error_file}) - message(VERBOSE "CUDF: Auto detection of gpu-archs: ${__gpu_archs}") - set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE) -endfunction() + message(STATUS "CUDF: Auto detection of gpu-archs: ${__gpu_archs}") + set(${gpu_archs} + ${__gpu_archs} + PARENT_SCOPE) +endfunction(evaluate_gpu_archs) diff --git a/cpp/cmake/Modules/SetGPUArchs.cmake b/cpp/cmake/Modules/SetGPUArchs.cmake index 396023ee9a9..61e4e6bc198 100644 --- a/cpp/cmake/Modules/SetGPUArchs.cmake +++ b/cpp/cmake/Modules/SetGPUArchs.cmake @@ -25,35 +25,41 @@ else() list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "62" "72") endif() -if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11) +# CMake < 3.20 has a bug in FindCUDAToolkit where it won't properly detect the CUDAToolkit version +# when find_package(CUDAToolkit) occurs before enable_language(CUDA) +if(NOT DEFINED CUDAToolkit_VERSION AND CMAKE_CUDA_COMPILER) + execute_process(COMMAND ${CMAKE_CUDA_COMPILER} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) +endif() + +if(CUDAToolkit_VERSION_MAJOR LESS 11) list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "80") endif() -if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10) +if(CUDAToolkit_VERSION_MAJOR LESS 10) list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "75") endif() -if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9) +if(CUDAToolkit_VERSION_MAJOR LESS 9) list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "70") endif() -if(CUDF_BUILD_FOR_DETECTED_ARCHS) - include(${CUDF_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake) - evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES) - if(CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL") - unset(CMAKE_CUDA_ARCHITECTURES CACHE) - set(CUDF_BUILD_FOR_ALL_ARCHS TRUE) - else() - 
set(CUDF_BUILD_FOR_ALL_ARCHS FALSE) - list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real") - endif() -endif() - -if(CUDF_BUILD_FOR_ALL_ARCHS) +if(${PROJECT_NAME}_BUILD_FOR_ALL_ARCHS) set(CMAKE_CUDA_ARCHITECTURES ${SUPPORTED_CUDA_ARCHITECTURES}) - # CMake architecture list entry of "80" means to build compute and sm. - # What we want is for the newest arch only to build that way - # while the rest built only for sm. - list(SORT CMAKE_CUDA_ARCHITECTURES ORDER ASCENDING) + + # CMake architecture list entry of "80" means to build compute and sm. What we want is for the + # newest arch only to build that way while the rest built only for sm. list(POP_BACK CMAKE_CUDA_ARCHITECTURES latest_arch) list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real") list(APPEND CMAKE_CUDA_ARCHITECTURES ${latest_arch}) + +elseif(${PROJECT_NAME}_BUILD_FOR_DETECTED_ARCHS) + include(${PROJECT_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake) + evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES) + + list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real") endif() From 561f68a387578cf491da27c475ee7439ecd8855f Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 15 Mar 2021 19:23:32 -0500 Subject: [PATCH 03/21] Match Pandas logic for comparing two objects with nulls (#7490) Fixes https://github.com/rapidsai/cudf/issues/7066 Authors: - @brandon-b-miller Approvers: - Ashwin Srinath (@shwina) - Christopher Harris (@cwharris) - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7490 --- python/cudf/cudf/_lib/binaryop.pyx | 20 +-- python/cudf/cudf/_lib/cpp/binaryop.pxd | 1 + python/cudf/cudf/_lib/reduce.pyx | 2 + python/cudf/cudf/core/column/categorical.py | 6 +- python/cudf/cudf/core/column/column.py | 6 +- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical.py | 13 +- python/cudf/cudf/core/column/string.py | 3 +- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dataframe.py | 14 +- python/cudf/cudf/core/frame.py | 5 +- python/cudf/cudf/core/series.py | 6 +- python/cudf/cudf/tests/test_binops.py | 144 ++++++++++++++++---- python/cudf/cudf/tests/test_dataframe.py | 28 ++-- python/cudf/cudf/tests/test_indexing.py | 11 -- python/cudf/cudf/tests/test_setitem.py | 7 +- 16 files changed, 182 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index 59a6b876961..5eaec640b15 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -93,6 +93,9 @@ class BinaryOperation(IntEnum): GENERIC_BINARY = ( binary_operator.GENERIC_BINARY ) + NULL_EQUALS = ( + binary_operator.NULL_EQUALS + ) cdef binaryop_v_v(Column lhs, Column rhs, @@ -154,17 +157,6 @@ cdef binaryop_s_v(DeviceScalar lhs, Column rhs, return Column.from_unique_ptr(move(c_result)) -def handle_null_for_string_column(Column input_col, op): - if op in ('eq', 'lt', 'le', 'gt', 'ge'): - return replace_nulls(input_col, DeviceScalar(False, 'bool')) - - elif op == 'ne': - return replace_nulls(input_col, DeviceScalar(True, 'bool')) - - # Nothing needs to be done - return input_col - - def binaryop(lhs, rhs, op, dtype): """ Dispatches a binary op call to the appropriate libcudf function: @@ -205,11 +197,7 @@ def binaryop(lhs, rhs, op, dtype): c_op, c_dtype ) - - if is_string_col is True: - return handle_null_for_string_column(result, op.name.lower()) - else: - return result + return result def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype): diff 
--git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/cpp/binaryop.pxd index fb36fdfd639..2e36070a164 100644 --- a/python/cudf/cudf/_lib/cpp/binaryop.pxd +++ b/python/cudf/cudf/_lib/cpp/binaryop.pxd @@ -27,6 +27,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: GREATER "cudf::binary_operator::GREATER" LESS_EQUAL "cudf::binary_operator::LESS_EQUAL" GREATER_EQUAL "cudf::binary_operator::GREATER_EQUAL" + NULL_EQUALS "cudf::binary_operator::NULL_EQUALS" BITWISE_AND "cudf::binary_operator::BITWISE_AND" BITWISE_OR "cudf::binary_operator::BITWISE_OR" BITWISE_XOR "cudf::binary_operator::BITWISE_XOR" diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 7b455dd574b..2185cb089a7 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -57,6 +57,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): return incol.dtype.type(0) if reduction_op == 'product': return incol.dtype.type(1) + if reduction_op == "any": + return False return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c41a458f02b..39c278d2abf 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1014,7 +1014,11 @@ def slice( def binary_operator( self, op: str, rhs, reflect: bool = False ) -> ColumnBase: - if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"): + if not (self.ordered and rhs.ordered) and op not in ( + "eq", + "ne", + "NULL_EQUALS", + ): if op in ("lt", "gt", "le", "ge"): raise TypeError( "Unordered Categoricals can only compare equality or not" diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2bb35c97d7c..b2b2874eeb4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -179,7 +179,11 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: if check_dtypes: if self.dtype != other.dtype: return False - return (self == other).min() + null_equals = self._null_equals(other) + return null_equals.all() + + def _null_equals(self, other: ColumnBase) -> ColumnBase: + return self.binary_operator("NULL_EQUALS", other) def all(self) -> bool: return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 7c5385b9bbf..a563248f4ab 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -274,7 +274,7 @@ def binary_operator( if isinstance(rhs, cudf.DateOffset): return binop_offset(self, rhs, op) lhs, rhs = self, rhs - if op in ("eq", "ne", "lt", "gt", "le", "ge"): + if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"): out_dtype = np.dtype(np.bool_) # type: Dtype elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6fae8c644e3..7ad6eed65a8 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -700,16 +700,21 @@ def _numeric_column_binop( if reflect: lhs, rhs = rhs, lhs - is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"] + is_op_comparison = op in [ + "lt", + "gt", + "le", + "ge", + "eq", + "ne", + "NULL_EQUALS", + ] if is_op_comparison: out_dtype 
= "bool" out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if is_op_comparison: - out = out.fillna(op == "ne") - return out diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 81abdd3f66a..ea01aa07b91 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -434,7 +434,6 @@ def cat(self, others=None, sep=None, na_rep=None): 3 dD dtype: object """ - if sep is None: sep = "" @@ -5109,7 +5108,7 @@ def binary_operator( if isinstance(rhs, (StringColumn, str, cudf.Scalar)): if op == "add": return cast("column.ColumnBase", lhs.str().cat(others=rhs)) - elif op in ("eq", "ne", "gt", "lt", "ge", "le"): + elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"): return _string_column_binop(self, rhs, op=op, out_dtype="bool") raise TypeError( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index ac63192b692..e22b511db01 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -223,7 +223,7 @@ def binary_operator( if op in ("eq", "ne"): out_dtype = self._binary_op_eq_ne(rhs) - elif op in ("lt", "gt", "le", "ge"): + elif op in ("lt", "gt", "le", "ge", "NULL_EQUALS"): out_dtype = self._binary_op_lt_gt_le_ge(rhs) elif op == "mul": out_dtype = self._binary_op_mul(rhs) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ecdce9443a1..25f57748765 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6031,7 +6031,6 @@ def isin(self, values): falcon True True dog False False """ - if isinstance(values, dict): result_df = DataFrame() @@ -6051,14 +6050,15 @@ def isin(self, values): values = values.reindex(self.index) result = DataFrame() - + # TODO: propagate nulls through isin + # https://github.com/rapidsai/cudf/issues/7556 for col in self._data.names: if isinstance( self[col]._column, cudf.core.column.CategoricalColumn ) and isinstance( values._column, cudf.core.column.CategoricalColumn ): - res = self._data[col] == values._column + res = (self._data[col] == values._column).fillna(False) result[col] = res elif ( isinstance( @@ -6073,7 +6073,9 @@ def isin(self, values): ): result[col] = utils.scalar_broadcast_to(False, len(self)) else: - result[col] = self._data[col] == values._column + result[col] = (self._data[col] == values._column).fillna( + False + ) result.index = self.index return result @@ -6083,7 +6085,9 @@ def isin(self, values): result = DataFrame() for col in self._data.names: if col in values.columns: - result[col] = self._data[col] == values[col]._column + result[col] = ( + self._data[col] == values[col]._column + ).fillna(False) else: result[col] = utils.scalar_broadcast_to(False, len(self)) result.index = self.index diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 275d085ef5d..fab5936f94d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1566,10 +1566,7 @@ def _apply_boolean_mask(self, boolean_mask): rows corresponding to `False` is dropped """ boolean_mask = as_column(boolean_mask) - if boolean_mask.has_nulls: - raise ValueError( - "cannot mask with boolean_mask containing null values" - ) + result = self.__class__._from_table( libcudf.stream_compaction.apply_boolean_mask( self, as_column(boolean_mask) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 11e32e2285d..5e7121c0488 100644 --- 
a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3120,8 +3120,10 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): "bool_only parameter is not implemented yet" ) - if self.empty: - return False + skipna = False if skipna is None else skipna + + if skipna is False and self.has_nulls: + return True if skipna: result_series = self.nans_to_nulls() diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index a0b65743180..18f2d7e474b 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -206,12 +206,45 @@ def test_series_compare(cmpop, obj_class, dtype): np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2)) +def _series_compare_nulls_typegen(): + tests = [] + tests += list(product(DATETIME_TYPES, DATETIME_TYPES)) + tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) + tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES)) + tests += list(product(STRING_TYPES, STRING_TYPES)) + + return tests + + +@pytest.mark.parametrize("cmpop", _cmpops) +@pytest.mark.parametrize("dtypes", _series_compare_nulls_typegen()) +def test_series_compare_nulls(cmpop, dtypes): + ltype, rtype = dtypes + + ldata = [1, 2, None, None, 5] + rdata = [2, 1, None, 4, None] + + lser = Series(ldata, dtype=ltype) + rser = Series(rdata, dtype=rtype) + + lmask = ~lser.isnull() + rmask = ~rser.isnull() + + expect_mask = np.logical_and(lmask, rmask) + expect = cudf.Series([None] * 5, dtype="bool") + expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask]) + + got = cmpop(lser, rser) + utils.assert_eq(expect, got) + + @pytest.mark.parametrize( - "obj", [pd.Series(["a", "b", None, "d", "e", None]), "a"] + "obj", [pd.Series(["a", "b", None, "d", "e", None], dtype="string"), "a"] ) @pytest.mark.parametrize("cmpop", _cmpops) @pytest.mark.parametrize( - "cmp_obj", [pd.Series(["b", "a", None, "d", "f", None]), "a"] + "cmp_obj", + [pd.Series(["b", "a", None, "d", "f", None], dtype="string"), "a"], ) def test_string_series_compare(obj, cmpop, cmp_obj): @@ -221,10 +254,12 @@ def test_string_series_compare(obj, cmpop, cmp_obj): g_cmp_obj = cmp_obj if isinstance(g_cmp_obj, pd.Series): g_cmp_obj = Series.from_pandas(g_cmp_obj) - got = cmpop(g_obj, g_cmp_obj) expected = cmpop(obj, cmp_obj) + if isinstance(expected, pd.Series): + expected = cudf.from_pandas(expected) + utils.assert_eq(expected, got) @@ -694,10 +729,12 @@ def test_operator_func_series_and_scalar( def test_operator_func_between_series_logical( dtype, func, scalar_a, scalar_b, fill_value ): - gdf_series_a = Series([scalar_a]).astype(dtype) - gdf_series_b = Series([scalar_b]).astype(dtype) - pdf_series_a = gdf_series_a.to_pandas() - pdf_series_b = gdf_series_b.to_pandas() + + gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype) + gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype) + + pdf_series_a = gdf_series_a.to_pandas(nullable=True) + pdf_series_b = gdf_series_b.to_pandas(nullable=True) gdf_series_result = getattr(gdf_series_a, func)( gdf_series_b, fill_value=fill_value @@ -705,16 +742,22 @@ def test_operator_func_between_series_logical( pdf_series_result = getattr(pdf_series_a, func)( pdf_series_b, fill_value=fill_value ) - - if scalar_a in [None, np.nan] and scalar_b in [None, np.nan]: - # cudf binary operations will return `None` when both left- and right- - # side values are `None`. It will return `np.nan` when either side is - # `np.nan`. 
As a consequence, when we convert our gdf => pdf during - # assert_eq, we get a pdf with dtype='object' (all inputs are none). - # to account for this, we use fillna. - gdf_series_result.fillna(func == "ne", inplace=True) - - utils.assert_eq(pdf_series_result, gdf_series_result) + expect = pdf_series_result + got = gdf_series_result.to_pandas(nullable=True) + + # If fill_value is np.nan, things break down a bit, + # because setting a NaN into a pandas nullable float + # array still gets transformed to . As such, + # pd_series_with_nulls.fillna(np.nan) has no effect. + if ( + (pdf_series_a.isnull().sum() != pdf_series_b.isnull().sum()) + and np.isscalar(fill_value) + and np.isnan(fill_value) + ): + with pytest.raises(AssertionError): + utils.assert_eq(expect, got) + return + utils.assert_eq(expect, got) @pytest.mark.parametrize("dtype", ["float32", "float64"]) @@ -729,8 +772,7 @@ def test_operator_func_series_and_scalar_logical( gdf_series = utils.gen_rand_series( dtype, 1000, has_nulls=has_nulls, stride=10000 ) - pdf_series = gdf_series.to_pandas() - + pdf_series = gdf_series.to_pandas(nullable=True) gdf_series_result = getattr(gdf_series, func)( cudf.Scalar(scalar) if use_cudf_scalar else scalar, fill_value=fill_value, @@ -739,7 +781,10 @@ def test_operator_func_series_and_scalar_logical( scalar, fill_value=fill_value ) - utils.assert_eq(pdf_series_result, gdf_series_result) + expect = pdf_series_result + got = gdf_series_result.to_pandas(nullable=True) + + utils.assert_eq(expect, got) @pytest.mark.parametrize("func", _operators_arithmetic) @@ -1738,10 +1783,61 @@ def test_equality_ops_index_mismatch(fn): index=["aa", "b", "c", "d", "e", "f", "y", "z"], ) - pa = a.to_pandas() - pb = b.to_pandas() - + pa = a.to_pandas(nullable=True) + pb = b.to_pandas(nullable=True) expected = getattr(pa, fn)(pb) - actual = getattr(a, fn)(b) + actual = getattr(a, fn)(b).to_pandas(nullable=True) utils.assert_eq(expected, actual) + + +def generate_test_null_equals_columnops_data(): + # Generate tuples of: + # (left_data, right_data, compare_bool + # where compare_bool is the correct answer to + # if the columns should compare as null equals + + def set_null_cases(column_l, column_r, case): + if case == "neither": + return column_l, column_r + elif case == "left": + column_l[1] = None + elif case == "right": + column_r[1] = None + elif case == "both": + column_l[1] = None + column_r[1] = None + else: + raise ValueError("Unknown null case") + return column_l, column_r + + null_cases = ["neither", "left", "right", "both"] + data = [1, 2, 3] + + results = [] + # TODO: Numeric types can be cross compared as null equal + for dtype in ( + list(NUMERIC_TYPES) + + list(DATETIME_TYPES) + + list(TIMEDELTA_TYPES) + + list(STRING_TYPES) + + ["category"] + ): + for case in null_cases: + left = cudf.Series(data, dtype=dtype) + right = cudf.Series(data, dtype=dtype) + if case in {"left", "right"}: + answer = False + else: + answer = True + left, right = set_null_cases(left, right, case) + results.append((left._column, right._column, answer, case)) + + return results + + +@pytest.mark.parametrize( + "lcol,rcol,ans,case", generate_test_null_equals_columnops_data() +) +def test_null_equals_columnops(lcol, rcol, ans, case): + assert lcol._null_equals(rcol).all() == ans diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ffd66e18314..77548b95277 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5017,12 +5017,14 @@ def 
test_cov_nans(): operator.truediv, operator.mod, operator.pow, - operator.eq, - operator.lt, - operator.le, - operator.gt, - operator.ge, - operator.ne, + # comparison ops will temporarily XFAIL + # see PR https://github.com/rapidsai/cudf/pull/7491 + pytest.param(operator.eq, marks=pytest.mark.xfail()), + pytest.param(operator.lt, marks=pytest.mark.xfail()), + pytest.param(operator.le, marks=pytest.mark.xfail()), + pytest.param(operator.gt, marks=pytest.mark.xfail()), + pytest.param(operator.ge, marks=pytest.mark.xfail()), + pytest.param(operator.ne, marks=pytest.mark.xfail()), ], ) def test_df_sr_binop(gsr, colnames, op): @@ -5052,12 +5054,14 @@ def test_df_sr_binop(gsr, colnames, op): operator.truediv, operator.mod, operator.pow, - operator.eq, - operator.lt, - operator.le, - operator.gt, - operator.ge, - operator.ne, + # comparison ops will temporarily XFAIL + # see PR https://github.com/rapidsai/cudf/pull/7491 + pytest.param(operator.eq, marks=pytest.mark.xfail()), + pytest.param(operator.lt, marks=pytest.mark.xfail()), + pytest.param(operator.le, marks=pytest.mark.xfail()), + pytest.param(operator.gt, marks=pytest.mark.xfail()), + pytest.param(operator.ge, marks=pytest.mark.xfail()), + pytest.param(operator.ne, marks=pytest.mark.xfail()), ], ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 558700f1f89..cec2623027f 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -755,17 +755,6 @@ def do_slice(x): assert_eq(expect, got, check_dtype=False) -def test_dataframe_boolean_mask_with_None(): - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - gdf = cudf.DataFrame.from_pandas(pdf) - pdf_masked = pdf[[True, False, True, False]] - gdf_masked = gdf[[True, False, True, False]] - assert_eq(pdf_masked, gdf_masked) - - with pytest.raises(ValueError): - gdf[cudf.Series([True, False, None, False])] - - @pytest.mark.parametrize("dtype", [int, float, str]) def test_empty_boolean_mask(dtype): gdf = cudf.datasets.randomdata(nrows=0, dtypes={"a": dtype}) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 4d2e2a4b33b..1005efec3ee 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -143,15 +143,14 @@ def test_setitem_dataframe_series_inplace(df): ) def test_series_set_equal_length_object_by_mask(replace_data): - psr = pd.Series([1, 2, 3, 4, 5]) + psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64") gsr = cudf.from_pandas(psr) # Lengths match in trivial case - pd_bool_col = pd.Series([True] * len(psr)) + pd_bool_col = pd.Series([True] * len(psr), dtype="boolean") gd_bool_col = cudf.from_pandas(pd_bool_col) - psr[pd_bool_col] = ( - replace_data.to_pandas() + replace_data.to_pandas(nullable=True) if hasattr(replace_data, "to_pandas") else replace_data ) From 2f5901ffb49eed3216d82d793e5a366a5e021d72 Mon Sep 17 00:00:00 2001 From: Keith Kraus Date: Tue, 16 Mar 2021 00:29:31 -0400 Subject: [PATCH 04/21] Fix 0.18 --> 0.19 automerge (#7589) Closes #7586 Brings the hotfix from #7568 into branch-0.19. 
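The underlying hotfix is the one-line change in grouped_rolling.cu shown a little further down: the group offsets array stores group boundaries, so it has one more entry than there are groups and the group count is size() - 1. A minimal illustration with made-up offsets:

#include <cstddef>
#include <vector>

int main() {
  // Offsets {0, 3, 5, 9} delimit 3 groups of rows: [0,3), [3,5), [5,9).
  std::vector<std::size_t> group_offsets{0, 3, 5, 9};
  auto num_groups = group_offsets.size() - 1;  // 3 groups, not 4
  return num_groups == 3 ? 0 : 1;
}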
Authors: - Keith Kraus (@kkraus14) - Ray Douglass (@raydouglass) - MithunR (@mythrocks) Approvers: - Nghia Truong (@ttnghia) URL: https://github.com/rapidsai/cudf/pull/7589 --- cpp/src/rolling/grouped_rolling.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 135df6bdfe2..c1ebc9f3f9f 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -384,7 +384,7 @@ get_null_bounds_for_timestamp_column(column_view const& timestamp_column, if (timestamp_column.has_nulls()) { auto p_timestamps_device_view = column_device_view::create(timestamp_column); - auto num_groups = group_offsets.size(); + auto num_groups = group_offsets.size() - 1; // Null timestamps exist. Find null bounds, per group. thrust::for_each( From c1c60ba3daf36d0ee5553558f70669f454d9f0c8 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 16 Mar 2021 14:27:55 -0500 Subject: [PATCH 05/21] Fix specifying GPU architecture in JNI build (#7612) After #7593 the variables for controlling the CUDA build for either all architectures or detected architectures changed to be based on the project name which broke the JNI build. This updates the JNI CMakeList accordingly to fix the JNI build. Authors: - Jason Lowe (@jlowe) Approvers: - Rong Ou (@rongou) - Gera Shegalov (@gerashegalov) - Thomas Graves (@tgravescs) URL: https://github.com/rapidsai/cudf/pull/7612 --- java/src/main/native/CMakeLists.txt | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index c1239fe69ea..ceafc75f840 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -17,10 +17,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) # Use GPU_ARCHS if it is defined if(DEFINED GPU_ARCHS) - unset(CMAKE_CUDA_ARCHITECTURES CACHE) - if(NOT "${GPU_ARCHS}" STREQUAL "ALL") - set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}") - endif() + set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}") endif() # If `CMAKE_CUDA_ARCHITECTURES` is not defined, build for all supported architectures. If @@ -29,11 +26,10 @@ endif() # This needs to be run before enabling the CUDA language due to the default initialization behavior # of `CMAKE_CUDA_ARCHITECTURES`, https://gitlab.kitware.com/cmake/cmake/-/issues/21302 -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CUDF_BUILD_FOR_ALL_ARCHS TRUE) +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL") + set(CUDF_JNI_BUILD_FOR_ALL_ARCHS TRUE) elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") - unset(CMAKE_CUDA_ARCHITECTURES CACHE) - set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) + set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE) endif() project(CUDF_JNI VERSION 0.19 LANGUAGES C CXX) From 2b2c0d2a071a9e5ad9c1f596e5013b7eba5beaee Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 16 Mar 2021 16:00:26 -0700 Subject: [PATCH 06/21] Support Decimal DIV changes in cudf (#7527) @codereport is making changes to the way `DIV` will behave for fixed-point types #7435. This PR contains Java changes to support those changes. 
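The central change, visible in the ColumnViewJni.cpp hunk further down, is that the Java bindings stop deriving the output scale of decimal binary operations themselves and instead ask libcudf via cudf::binary_operation_fixed_point_scale. A rough host-side sketch of that call follows; the header location and the example scales are assumptions, not taken from this diff.

#include <cudf/binaryop.hpp>  // assumed to declare binary_operation_fixed_point_scale

#include <cstdint>
#include <iostream>

int main() {
  // cudf fixed-point scales are exponents of ten; -2 means two fractional digits.
  std::int32_t lhs_scale = -2;
  std::int32_t rhs_scale = -1;

  // Let libcudf decide the scale a DIV result should carry, mirroring what
  // the JNI layer now does on behalf of BinaryOperable.implicitConversion.
  auto out_scale = cudf::binary_operation_fixed_point_scale(
      cudf::binary_operator::DIV, lhs_scale, rhs_scale);

  std::cout << "output scale: " << out_scale << "\n";
  return 0;
}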
Note: This is a draft until #7435 is merged Authors: - Raza Jafri (@razajafri) Approvers: - MithunR (@mythrocks) - Jason Lowe (@jlowe) - Gera Shegalov (@gerashegalov) URL: https://github.com/rapidsai/cudf/pull/7527 --- .../java/ai/rapids/cudf/BinaryOperable.java | 50 ++++++++++--------- .../main/java/ai/rapids/cudf/ColumnView.java | 7 +++ java/src/main/native/src/ColumnViewJni.cpp | 13 +++++ 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java index e5e849a74c4..68213c21956 100644 --- a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java +++ b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java @@ -38,7 +38,7 @@ public interface BinaryOperable { * with scale=0 as scale is required. Dtype is discarded for binary operations for decimal * types in cudf as a new DType is created for output type with the new scale. */ - static DType implicitConversion(BinaryOperable lhs, BinaryOperable rhs) { + static DType implicitConversion(BinaryOp op, BinaryOperable lhs, BinaryOperable rhs) { DType a = lhs.getType(); DType b = rhs.getType(); if (a.equals(DType.FLOAT64) || b.equals(DType.FLOAT64)) { @@ -86,13 +86,15 @@ static DType implicitConversion(BinaryOperable lhs, BinaryOperable rhs) { int scale = 0; if (a.typeId == DType.DTypeEnum.DECIMAL32) { if (b.typeId == DType.DTypeEnum.DECIMAL32) { - return DType.create(DType.DTypeEnum.DECIMAL32, scale); + return DType.create(DType.DTypeEnum.DECIMAL32, + ColumnView.getFixedPointOutputScale(op, lhs.getType(), rhs.getType())); } else { throw new IllegalArgumentException("Both columns must be of the same fixed_point type"); } } else if (a.typeId == DType.DTypeEnum.DECIMAL64) { if (b.typeId == DType.DTypeEnum.DECIMAL64) { - return DType.create(DType.DTypeEnum.DECIMAL64, scale); + return DType.create(DType.DTypeEnum.DECIMAL64, + ColumnView.getFixedPointOutputScale(op, lhs.getType(), rhs.getType())); } else { throw new IllegalArgumentException("Both columns must be of the same fixed_point type"); } @@ -128,7 +130,7 @@ default ColumnVector add(BinaryOperable rhs, DType outType) { * Add + operator. this + rhs */ default ColumnVector add(BinaryOperable rhs) { - return add(rhs, implicitConversion(this, rhs)); + return add(rhs, implicitConversion(BinaryOp.ADD, this, rhs)); } /** @@ -144,7 +146,7 @@ default ColumnVector sub(BinaryOperable rhs, DType outType) { * Subtract one vector from another. this - rhs */ default ColumnVector sub(BinaryOperable rhs) { - return sub(rhs, implicitConversion(this, rhs)); + return sub(rhs, implicitConversion(BinaryOp.SUB, this, rhs)); } /** @@ -160,7 +162,7 @@ default ColumnVector mul(BinaryOperable rhs, DType outType) { * Multiply two vectors together. this * rhs */ default ColumnVector mul(BinaryOperable rhs) { - return mul(rhs, implicitConversion(this, rhs)); + return mul(rhs, implicitConversion(BinaryOp.MUL, this, rhs)); } /** @@ -176,7 +178,7 @@ default ColumnVector div(BinaryOperable rhs, DType outType) { * Divide one vector by another. 
this / rhs */ default ColumnVector div(BinaryOperable rhs) { - return div(rhs, implicitConversion(this, rhs)); + return div(rhs, implicitConversion(BinaryOp.DIV, this, rhs)); } /** @@ -192,7 +194,7 @@ default ColumnVector trueDiv(BinaryOperable rhs, DType outType) { * (double)this / (double)rhs */ default ColumnVector trueDiv(BinaryOperable rhs) { - return trueDiv(rhs, implicitConversion(this, rhs)); + return trueDiv(rhs, implicitConversion(BinaryOp.TRUE_DIV, this, rhs)); } /** @@ -208,7 +210,7 @@ default ColumnVector floorDiv(BinaryOperable rhs, DType outType) { * Math.floor(this/rhs) */ default ColumnVector floorDiv(BinaryOperable rhs) { - return floorDiv(rhs, implicitConversion(this, rhs)); + return floorDiv(rhs, implicitConversion(BinaryOp.FLOOR_DIV, this, rhs)); } /** @@ -224,7 +226,7 @@ default ColumnVector mod(BinaryOperable rhs, DType outType) { * this % rhs */ default ColumnVector mod(BinaryOperable rhs) { - return mod(rhs, implicitConversion(this, rhs)); + return mod(rhs, implicitConversion(BinaryOp.MOD, this, rhs)); } /** @@ -240,7 +242,7 @@ default ColumnVector pow(BinaryOperable rhs, DType outType) { * Math.pow(this, rhs) */ default ColumnVector pow(BinaryOperable rhs) { - return pow(rhs, implicitConversion(this, rhs)); + return pow(rhs, implicitConversion(BinaryOp.POW, this, rhs)); } /** @@ -338,7 +340,7 @@ default ColumnVector bitAnd(BinaryOperable rhs, DType outType) { * Bit wise and (&). this & rhs */ default ColumnVector bitAnd(BinaryOperable rhs) { - return bitAnd(rhs, implicitConversion(this, rhs)); + return bitAnd(rhs, implicitConversion(BinaryOp.BITWISE_AND, this, rhs)); } /** @@ -352,7 +354,7 @@ default ColumnVector bitOr(BinaryOperable rhs, DType outType) { * Bit wise or (|). this | rhs */ default ColumnVector bitOr(BinaryOperable rhs) { - return bitOr(rhs, implicitConversion(this, rhs)); + return bitOr(rhs, implicitConversion(BinaryOp.BITWISE_OR, this, rhs)); } /** @@ -366,7 +368,7 @@ default ColumnVector bitXor(BinaryOperable rhs, DType outType) { * Bit wise xor (^). this ^ rhs */ default ColumnVector bitXor(BinaryOperable rhs) { - return bitXor(rhs, implicitConversion(this, rhs)); + return bitXor(rhs, implicitConversion(BinaryOp.BITWISE_XOR, this, rhs)); } /** @@ -380,7 +382,7 @@ default ColumnVector and(BinaryOperable rhs, DType outType) { * Logical and (&&). this && rhs */ default ColumnVector and(BinaryOperable rhs) { - return and(rhs, implicitConversion(this, rhs)); + return and(rhs, implicitConversion(BinaryOp.LOGICAL_AND, this, rhs)); } /** @@ -394,7 +396,7 @@ default ColumnVector or(BinaryOperable rhs, DType outType) { * Logical or (||). 
this || rhs */ default ColumnVector or(BinaryOperable rhs) { - return or(rhs, implicitConversion(this, rhs)); + return or(rhs, implicitConversion(BinaryOp.LOGICAL_OR, this, rhs)); } /** @@ -421,7 +423,7 @@ default ColumnVector shiftLeft(BinaryOperable shiftBy, DType outType) { * with this[i] << shiftBy */ default ColumnVector shiftLeft(BinaryOperable shiftBy) { - return shiftLeft(shiftBy, implicitConversion(this, shiftBy)); + return shiftLeft(shiftBy, implicitConversion(BinaryOp.SHIFT_LEFT, this, shiftBy)); } /** @@ -447,7 +449,7 @@ default ColumnVector shiftRight(BinaryOperable shiftBy, DType outType) { * with this[i] >> shiftBy */ default ColumnVector shiftRight(BinaryOperable shiftBy) { - return shiftRight(shiftBy, implicitConversion(this, shiftBy)); + return shiftRight(shiftBy, implicitConversion(BinaryOp.SHIFT_RIGHT, this, shiftBy)); } /** @@ -475,7 +477,8 @@ default ColumnVector shiftRightUnsigned(BinaryOperable shiftBy, DType outType) { * with this[i] >>> shiftBy */ default ColumnVector shiftRightUnsigned(BinaryOperable shiftBy) { - return shiftRightUnsigned(shiftBy, implicitConversion(this, shiftBy)); + return shiftRightUnsigned(shiftBy, implicitConversion(BinaryOp.SHIFT_RIGHT_UNSIGNED, this, + shiftBy)); } /** @@ -505,7 +508,7 @@ default ColumnVector arctan2(BinaryOperable xCoordinate, DType outType) { * in radians, between the positive x axis and the ray to the point (x, y) ≠ (0, 0). */ default ColumnVector arctan2(BinaryOperable xCoordinate) { - return arctan2(xCoordinate, implicitConversion(this, xCoordinate)); + return arctan2(xCoordinate, implicitConversion(BinaryOp.ATAN2, this, xCoordinate)); } /** @@ -529,7 +532,7 @@ default ColumnVector pmod(BinaryOperable rhs, DType outputType) { * */ default ColumnVector pmod(BinaryOperable rhs) { - return pmod(rhs, implicitConversion(this, rhs)); + return pmod(rhs, implicitConversion(BinaryOp.PMOD, this, rhs)); } /** @@ -557,7 +560,7 @@ default ColumnVector maxNullAware(BinaryOperable rhs, DType outType) { * Returns the max non null value. */ default ColumnVector maxNullAware(BinaryOperable rhs) { - return maxNullAware(rhs, implicitConversion(this, rhs)); + return maxNullAware(rhs, implicitConversion(BinaryOp.NULL_MAX, this, rhs)); } /** @@ -571,6 +574,7 @@ default ColumnVector minNullAware(BinaryOperable rhs, DType outType) { * Returns the min non null value. 
*/ default ColumnVector minNullAware(BinaryOperable rhs) { - return minNullAware(rhs, implicitConversion(this, rhs)); + return minNullAware(rhs, implicitConversion(BinaryOp.NULL_MIN, this, rhs)); } + } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index f36896a3c96..2f3f2bf80cf 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -129,6 +129,13 @@ public final long getNativeView() { return viewHandle; } + static int getFixedPointOutputScale(BinaryOp op, DType lhsType, DType rhsType) { + assert (lhsType.isDecimalType() && rhsType.isDecimalType()); + return fixedPointOutputScale(op.nativeId, lhsType.getScale(), rhsType.getScale()); + } + + private static native int fixedPointOutputScale(int op, int lhsScale, int rhsScale); + public final DType getType() { return type; } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index e8474bda1be..0ce9d6303e4 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -60,6 +60,7 @@ #include #include #include +#include "cudf/types.hpp" #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" @@ -1026,6 +1027,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, j CATCH_STD(env, 0); } +JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_fixedPointOutputScale(JNIEnv *env, jclass, + jint int_op, + jint lhs_scale, + jint rhs_scale) { + try { + // we just return the scale as the types will be the same as the lhs input + return cudf::binary_operation_fixed_point_scale(static_cast(int_op), + lhs_scale, rhs_scale); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(JNIEnv *env, jclass, jlong lhs_view, jlong rhs_ptr, jint int_op, jint out_dtype, From 57a7a8f6931f55b3241fb2b5ef51f41ab4d291ef Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 16 Mar 2021 17:09:35 -0700 Subject: [PATCH 07/21] Use file(COPY ) over file(INSTALL ) so cmake output is reduced (#7616) Currently it can be hard to see important details in cudf CMake output. The biggest culprit is the usage of `file(INSTALL` which outputs per file the current status on each configuration, as shown below: ``` - Up-to-date: /home/nfs/rmaynard/Work/cudf/cpp/build/cuda-11.0/branch-0.19/release/include/libcxx/include/ ``` Moving to `file(COPY` has all the same behavior but doesn't have any output. 
Authors: - Robert Maynard (@robertmaynard) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7616 --- cpp/cmake/Modules/StringifyJITHeaders.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/Modules/StringifyJITHeaders.cmake b/cpp/cmake/Modules/StringifyJITHeaders.cmake index 36752d7f715..0bfb37773dc 100644 --- a/cpp/cmake/Modules/StringifyJITHeaders.cmake +++ b/cpp/cmake/Modules/StringifyJITHeaders.cmake @@ -164,5 +164,5 @@ add_custom_target(stringify_run DEPENDS # - copy libcu++ ---------------------------------------------------------------------------------- # `${LIBCUDACXX_INCLUDE_DIR}/` specifies that the contents of this directory will be installed (not the directory itself) -file(INSTALL "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") -file(INSTALL "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") +file(COPY "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") +file(COPY "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") From 5f127323906c92e8525a857c9fbd80ae2122cec2 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Tue, 16 Mar 2021 21:36:08 -0500 Subject: [PATCH 08/21] Always build and export the cudf::cudftestutil target (#7574) We should always build the static `cudftestutil` target regardless of whether `BUILD_TESTS=ON` was passed. Needed by https://github.com/rapidsai/cudf/pull/7484 and https://github.com/rapidsai/cuspatial/pull/365. Authors: - Paul Taylor (@trxcllnt) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7574 --- cpp/CMakeLists.txt | 66 +++++++++++++----------- cpp/cmake/cudf-build-config.cmake.in | 2 + cpp/cmake/cudf-config.cmake.in | 7 +-- cpp/cmake/thirdparty/CUDF_GetCPM.cmake | 11 ++++ cpp/cmake/thirdparty/CUDF_GetGTest.cmake | 53 +++++++++++++++++++ cpp/cmake/thirdparty/CUDF_GetRMM.cmake | 10 ++-- cpp/tests/CMakeLists.txt | 32 ------------ 7 files changed, 107 insertions(+), 74 deletions(-) create mode 100644 cpp/cmake/thirdparty/CUDF_GetGTest.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2a51ad5e55a..103b163e260 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -117,6 +117,8 @@ include(cmake/Modules/ConfigureCUDA.cmake) # find zlib find_package(ZLIB REQUIRED) +# find Threads (needed by cudftestutil) +find_package(Threads REQUIRED) # add third party dependencies using CPM include(cmake/thirdparty/CUDF_GetCPM.cmake) # find boost @@ -133,6 +135,8 @@ include(cmake/thirdparty/CUDF_GetArrow.cmake) include(cmake/thirdparty/CUDF_GetDLPack.cmake) # find libcu++ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) +# find or install GoogleTest +include(cmake/thirdparty/CUDF_GetGTest.cmake) # Stringify libcudf and libcudacxx headers used in JIT operations include(cmake/Modules/StringifyJITHeaders.cmake) @@ -417,7 +421,8 @@ target_include_directories(cudf "$" "$" PRIVATE "$" - INTERFACE "$") + INTERFACE "$" + "$") # Add Conda library paths if specified if(CONDA_LINK_DIRS) @@ -480,34 +485,37 @@ add_library(cudf::cudf ALIAS cudf) # - tests and benchmarks -------------------------------------------------------------------------- ################################################################################################### -if (CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS) - # Find or install GoogleTest - CPMFindPackage(NAME GTest - VERSION 1.10.0 - GIT_REPOSITORY 
https://github.com/google/googletest.git - GIT_TAG release-1.10.0 - GIT_SHALLOW TRUE - OPTIONS "INSTALL_GTEST OFF" - # googletest >= 1.10.0 provides a cmake config file -- use it if it exists - FIND_PACKAGE_ARGUMENTS "CONFIG") - # Add GTest aliases if they don't already exist. - # Assumes if GTest::gtest doesn't exist, the others don't either. - # TODO: Is this always a valid assumption? - if(NOT TARGET GTest::gtest) - add_library(GTest::gtest ALIAS gtest) - add_library(GTest::gmock ALIAS gmock) - add_library(GTest::gtest_main ALIAS gtest_main) - add_library(GTest::gmock_main ALIAS gmock_main) - endif() - if(GTest_ADDED) - install(TARGETS gmock - gtest - gmock_main - gtest_main - DESTINATION lib - EXPORT cudf-targets) - endif() -endif() +################################################################################################### +# - build cudftestutil ---------------------------------------------------------------------------- + +add_library(cudftestutil STATIC + tests/utilities/base_fixture.cpp + tests/utilities/column_utilities.cu + tests/utilities/table_utilities.cu + tests/strings/utilities.cu) + +target_compile_options(cudftestutil + PUBLIC "$<$:${CUDF_CXX_FLAGS}>" + "$<$:${CUDF_CUDA_FLAGS}>" +) + +target_compile_features(cudftestutil PUBLIC cxx_std_14 cuda_std_14) + +target_link_libraries(cudftestutil + PUBLIC GTest::gmock + GTest::gtest + Threads::Threads + cudf) + +target_include_directories(cudftestutil + PUBLIC "$" + "$") + +install(TARGETS cudftestutil + DESTINATION lib + EXPORT cudf-targets) + +add_library(cudf::cudftestutil ALIAS cudftestutil) ################################################################################################### # - add tests ------------------------------------------------------------------------------------- diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in index 5f6b265384e..3f4d2e5586e 100644 --- a/cpp/cmake/cudf-build-config.cmake.in +++ b/cpp/cmake/cudf-build-config.cmake.in @@ -36,6 +36,8 @@ include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetThrust.cmake) # find rmm set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetRMM.cmake) +# find gtest +include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetGTest.cmake) # find arrow if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake") diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in index aeb7d9915cf..1147e1160e7 100644 --- a/cpp/cmake/cudf-config.cmake.in +++ b/cpp/cmake/cudf-config.cmake.in @@ -19,13 +19,8 @@ find_dependency(Arrow @CUDF_VERSION_Arrow@) find_dependency(ArrowCUDA @CUDF_VERSION_Arrow@) find_dependency(Boost @CUDF_MIN_VERSION_Boost@) -find_dependency(jitify) find_dependency(rmm @CUDF_MIN_VERSION_rmm@) -find_dependency(Thrust @CUDF_MIN_VERSION_Thrust@) -find_dependency(dlpack @CUDF_MIN_VERSION_dlpack@) -find_dependency(libcudacxx @CUDF_MIN_VERSION_libcudacxx@) - -thrust_create_target(cudf::Thrust FROM_OPTIONS) +find_dependency(gtest @CUDF_MIN_VERSION_gtest@) list(POP_FRONT CMAKE_MODULE_PATH) diff --git a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake index f50b9e7f646..5162aaf6ce7 100644 --- a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake @@ -17,3 +17,14 @@ if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) endif() include(${CPM_DOWNLOAD_LOCATION}) + +# If a target is installed, found by the `find_package` step of CPMFindPackage, +# and marked as IMPORTED, make it 
globally accessible to consumers of our libs. +function(fix_cmake_global_defaults target) + if(TARGET ${target}) + get_target_property(_is_imported ${target} IMPORTED) + if(_is_imported) + set_target_properties(${target} PROPERTIES IMPORTED_GLOBAL TRUE) + endif() + endif() +endfunction() diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake new file mode 100644 index 00000000000..2911e4fce29 --- /dev/null +++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake @@ -0,0 +1,53 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_gtest VERSION) + # Find or install GoogleTest + CPMFindPackage(NAME GTest + VERSION ${VERSION} + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-${VERSION} + GIT_SHALLOW TRUE + OPTIONS "INSTALL_GTEST OFF" + # googletest >= 1.10.0 provides a cmake config file -- use it if it exists + FIND_PACKAGE_ARGUMENTS "CONFIG") + # Add GTest aliases if they don't already exist. + # Assumes if GTest::gtest doesn't exist, the others don't either. + # TODO: Is this always a valid assumption? 
+ if(NOT TARGET GTest::gtest) + add_library(GTest::gtest ALIAS gtest) + add_library(GTest::gmock ALIAS gmock) + add_library(GTest::gtest_main ALIAS gtest_main) + add_library(GTest::gmock_main ALIAS gmock_main) + endif() + # Make sure consumers of cudf can also see GTest::* targets + fix_cmake_global_defaults(GTest::gtest) + fix_cmake_global_defaults(GTest::gmock) + fix_cmake_global_defaults(GTest::gtest_main) + fix_cmake_global_defaults(GTest::gmock_main) + if(GTest_ADDED) + install(TARGETS gmock + gtest + gmock_main + gtest_main + DESTINATION lib + EXPORT cudf-targets) + endif() +endfunction() + +set(CUDF_MIN_VERSION_gtest 1.10.0) + +find_and_configure_gtest(${CUDF_MIN_VERSION_gtest}) diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake index 16c8a2b39f4..54e0a8620c5 100644 --- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake @@ -48,13 +48,9 @@ function(find_and_configure_rmm VERSION) cudf_restore_if_enabled(BUILD_TESTS) cudf_restore_if_enabled(BUILD_BENCHMARKS) - #Make sure consumers of cudf can also see rmm::rmm - if(TARGET rmm::rmm) - get_target_property(rmm_is_imported rmm::rmm IMPORTED) - if(rmm_is_imported) - set_target_properties(rmm::rmm PROPERTIES IMPORTED_GLOBAL TRUE) - endif() - endif() + # Make sure consumers of cudf can also see rmm::rmm + fix_cmake_global_defaults(rmm::rmm) + if(NOT rmm_BINARY_DIR IN_LIST CMAKE_PREFIX_PATH) list(APPEND CMAKE_PREFIX_PATH "${rmm_BINARY_DIR}") set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} PARENT_SCOPE) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 40829c74957..b94e9587fc0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -14,38 +14,6 @@ # limitations under the License. #============================================================================= -################################################################################################### -# - common test utils ----------------------------------------------------------------------------- - -find_package(Threads REQUIRED) - -add_library(cudftestutil STATIC - utilities/base_fixture.cpp - utilities/column_utilities.cu - utilities/table_utilities.cu - strings/utilities.cu) - -target_compile_options(cudftestutil - PUBLIC "$<$:${CUDF_CXX_FLAGS}>" - "$<$:${CUDF_CUDA_FLAGS}>" -) - -target_compile_features(cudftestutil PUBLIC cxx_std_14 cuda_std_14) - -target_link_libraries(cudftestutil - PUBLIC GTest::gmock - GTest::gtest - Threads::Threads - cudf) - -target_include_directories(cudftestutil - PUBLIC "$" - "$") - -install(TARGETS cudftestutil - DESTINATION lib - EXPORT cudf-targets) - ################################################################################################### # - compiler function ----------------------------------------------------------------------------- From 34cccfe7b922562856e3d188f90d6a6ad50ab77f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 16 Mar 2021 21:08:05 -0700 Subject: [PATCH 09/21] Fix ORC writer OOM issue (#7605) Closes #7588 The stream size used to be calculated incorrectly, leading to huge allocation for the encoded data buffer. This PR fixes the stream size computation to count each row group only once. 
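The overcount came from a common `std::accumulate` pitfall: the per-stripe accumulation was seeded with the running total and its result was then added back into that same total, so row groups from earlier stripes were counted again for every later stripe. A minimal standalone sketch of the pattern, with made-up row-group sizes standing in for `string_char_count` (not the actual writer code):

```cpp
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  // Hypothetical per-row-group string sizes: two stripes of two row groups each.
  std::vector<std::size_t> rg_sizes{10, 20, 30, 40};

  // Buggy pattern: seeding each per-stripe accumulation with the running total
  // and then adding the result back counts earlier row groups multiple times.
  std::size_t buggy = 0;
  for (int stripe = 0; stripe < 2; ++stripe) {
    auto begin = rg_sizes.begin() + stripe * 2;
    buggy += std::accumulate(begin, begin + 2, buggy);  // running total reused as the init value
  }

  // Fixed pattern: accumulate every row group exactly once, starting from zero.
  auto once = std::accumulate(rg_sizes.begin(), rg_sizes.end(), std::size_t{0});

  std::cout << "buggy: " << buggy << " vs correct: " << once << '\n';  // prints 130 vs 100
  return 0;
}
```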
Authors: - Vukasin Milovanovic (@vuule) Approvers: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) - Kumar Aatish (@kaatish) - Devavret Makkar (@devavret) URL: https://github.com/rapidsai/cudf/pull/7605 --- cpp/src/io/orc/reader_impl.cu | 2 +- cpp/src/io/orc/writer_impl.cu | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 80935e3fbd5..61adef26dab 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -139,7 +139,7 @@ struct orc_stream_info { } uint64_t offset; // offset in file size_t dst_pos; // offset in memory relative to start of compressed stripe data - uint32_t length; // length in file + size_t length; // length in file uint32_t gdf_idx; // column index uint32_t stripe_idx; // stripe index }; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1c99c99369b..eb5e90bbeec 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -476,7 +476,6 @@ orc_streams writer::impl::create_streams(host_span columns, break; case TypeKind::STRING: { bool enable_dict = enable_dictionary_; - size_t direct_data_size = 0; size_t dict_data_size = 0; size_t dict_strings = 0; size_t dict_lengths_div512 = 0; @@ -488,11 +487,15 @@ orc_streams writer::impl::create_streams(host_span columns, dict_lengths_div512 += (sd->num_strings + 0x1ff) >> 9; dict_data_size += sd->dict_char_count; } - direct_data_size += std::accumulate( - stripe.cbegin(), stripe.cend(), direct_data_size, [&](auto data_size, auto rg_idx) { - return data_size + column.host_dict_chunk(rg_idx)->string_char_count; - }); } + + auto const direct_data_size = + std::accumulate(stripe_bounds.front().cbegin(), + stripe_bounds.back().cend(), + size_t{0}, + [&](auto data_size, auto rg_idx) { + return data_size + column.host_dict_chunk(rg_idx)->string_char_count; + }); if (enable_dict) { uint32_t dict_bits = 0; for (dict_bits = 1; dict_bits < 32; dict_bits <<= 1) { From 0146f743987a6f2a51aab08f34771eb4d3531afc Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 17 Mar 2021 01:40:50 -0400 Subject: [PATCH 10/21] Add explode_outer and explode_outer_position (#7499) This code adds support for explode_outer and explode_outer_position. These differ from explode and explode_position by the way null and empty lists are handled. Explode discards null and empty lists and as such, lifts the child column directly out of the list column. Explode_outer must find these null and empty lists and make space for a null entry in the child column. This means we need to gather both the table and the exploded column. Further, we must make a pass on the exploded column to count these entries initially as we do not know the required size of the gather maps until we have this information and it isn't just the null count. If there are no null or empty lists in the input, the normal explode function is called as it is simpler, but it does come at the cost of marching the offsets looking for duplicates, which indicate null or empty lists. 
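A rough host-side sketch of the gather-map construction described above, using plain `std::vector` in place of the device vectors and Thrust calls (illustrative only; names and layout are simplified, not the actual kernel code):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  // Three list rows: [5, 10], a null list, and [] -- in this sketch both the null
  // and the empty row contribute no child elements, i.e. zero-length offset spans.
  std::vector<int32_t> offsets{0, 2, 2, 2};
  int32_t const num_rows = static_cast<int32_t>(offsets.size()) - 1;

  // gather_map selects the parent-table row for each output row; child_gather_map
  // selects the child element, or -1 to emit a null for a null/empty list.
  std::vector<int32_t> gather_map;
  std::vector<int32_t> child_gather_map;
  for (int32_t row = 0; row < num_rows; ++row) {
    if (offsets[row + 1] == offsets[row]) {  // null or empty list: one null output row
      gather_map.push_back(row);
      child_gather_map.push_back(-1);
    } else {
      for (int32_t i = offsets[row]; i < offsets[row + 1]; ++i) {
        gather_map.push_back(row);
        child_gather_map.push_back(i);
      }
    }
  }

  // 2 child elements + 2 null/empty rows -> 4 output rows for explode_outer.
  std::cout << "output rows: " << gather_map.size() << '\n';
  return 0;
}
```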
closes #7466 Authors: - Mike Wilson (@hyperbolic2346) Approvers: - AJ Schmidt (@ajschmidt8) - Jake Hemstad (@jrhemstad) - Nghia Truong (@ttnghia) URL: https://github.com/rapidsai/cudf/pull/7499 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 2 +- cpp/include/cudf/lists/explode.hpp | 200 +++++++ cpp/include/cudf/reshape.hpp | 86 --- cpp/include/cudf/table/table.hpp | 26 +- cpp/include/cudf/table/table_view.hpp | 19 + cpp/src/lists/explode.cu | 314 ++++++++++ cpp/src/reshape/explode.cu | 178 ------ cpp/src/table/table.cpp | 8 - cpp/src/table/table_view.cpp | 6 +- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/lists/explode_tests.cpp | 819 ++++++++++++++++++++++++++ cpp/tests/reshape/explode_tests.cpp | 530 ----------------- 13 files changed, 1381 insertions(+), 810 deletions(-) create mode 100644 cpp/include/cudf/lists/explode.hpp create mode 100644 cpp/src/lists/explode.cu delete mode 100644 cpp/src/reshape/explode.cu create mode 100644 cpp/tests/lists/explode_tests.cpp delete mode 100644 cpp/tests/reshape/explode_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index e709824721c..5657d21889f 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -134,6 +134,7 @@ test: - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp + - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/contains.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 103b163e260..39acc362450 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -260,6 +260,7 @@ add_library(cudf src/lists/copying/gather.cu src/lists/copying/segmented_gather.cu src/lists/count_elements.cu + src/lists/explode.cu src/lists/extract.cu src/lists/drop_list_duplicates.cu src/lists/lists_column_factories.cu @@ -289,7 +290,6 @@ add_library(cudf src/replace/nulls.cu src/replace/replace.cu src/reshape/byte_cast.cu - src/reshape/explode.cu src/reshape/interleave_columns.cu src/reshape/tile.cu src/rolling/grouped_rolling.cu diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp new file mode 100644 index 00000000000..156d4b9275d --- /dev/null +++ b/cpp/include/cudf/lists/explode.hpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cudf { + +/** + * @brief Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. 
Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * ``` + * Note that null lists are not included in the resulting table, but nulls inside + * lists and empty lists will be represented with a null entry for that column in that row. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with explode_col exploded. + */ +std::unique_ptr explode( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Explodes a list column's elements and includes a position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * ``` + * Note that null lists are not included in the resulting table, but nulls inside + * lists and empty lists will be represented with a null entry for that column in that row. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with exploded value and position. The column order of return table is + * [cols before explode_input, explode_position, explode_value, cols after explode_input]. + */ +std::unique_ptr
explode_position( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Explodes a list column's elements retaining any null entries or empty lists inside. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * Nulls and empty lists propagate as null entries in the result. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * [null, 200], + * [null, 300], + * ``` + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with explode_col exploded. + */ +std::unique_ptr
explode_outer( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Explodes a list column's elements retaining any null entries or empty lists and includes a + *position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300], + * ``` + * + * Nulls and empty lists propagate as null entries in the result. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * [0, null, 200], + * [0, null, 300], + * ``` + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with explode_col exploded. + */ +std::unique_ptr
explode_outer_position( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group + +} // namespace cudf diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index a6030f31e6d..74e4ebb8d05 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -97,92 +97,6 @@ std::unique_ptr byte_cast( flip_endianness endian_configuration, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Explodes a list column's elements. - * - * Any list is exploded, which means the elements of the list in each row are expanded into new rows - * in the output. The corresponding rows for other columns in the input are duplicated. Example: - * ``` - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [5, 100], - * [10, 100], - * [15, 100], - * [20, 200], - * [25, 200], - * [30, 300], - * ``` - * - * Nulls and empty lists propagate in different ways depending on what is null or empty. - *``` - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [5, 100], - * [null, 100], - * [15, 100], - * ``` - * Note that null lists are not included in the resulting table, but nulls inside - * lists and empty lists will be represented with a null entry for that column in that row. - * - * @param input_table Table to explode. - * @param explode_column_idx Column index to explode inside the table. - * @param mr Device memory resource used to allocate the returned column's device memory. - * - * @return A new table with explode_col exploded. - */ -std::unique_ptr
explode( - table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Explodes a list column's elements and includes a position column. - * - * Any list is exploded, which means the elements of the list in each row are expanded into new rows - * in the output. The corresponding rows for other columns in the input are duplicated. A position - * column is added that has the index inside the original list for each row. Example: - * ``` - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [0, 5, 100], - * [1, 10, 100], - * [2, 15, 100], - * [0, 20, 200], - * [1, 25, 200], - * [0, 30, 300], - * ``` - * - * Nulls and empty lists propagate in different ways depending on what is null or empty. - *``` - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [0, 5, 100], - * [1, null, 100], - * [2, 15, 100], - * ``` - * Note that null lists are not included in the resulting table, but nulls inside - * lists and empty lists will be represented with a null entry for that column in that row. - * - * @param input_table Table to explode. - * @param explode_column_idx Column index to explode inside the table. - * @param mr Device memory resource used to allocate the returned column's device memory. - * - * @return A new table with exploded value and position. The column order of return table is - * [cols before explode_input, explode_position, explode_value, cols after explode_input]. - */ -std::unique_ptr
explode_position( - table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 553cf5e9096..4571362076c 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -110,6 +110,27 @@ class table { */ std::vector> release(); + /** + * @brief Returns a table_view built from a range of column indices. + * + * @throws std::out_of_range + * If any index is outside [0, num_columns()) + * + * @param begin Beginning of the range + * @param end Ending of the range + * @return A table_view consisting of columns from the original table + * specified by the elements of `column_indices` + */ + + template + table_view select(InputIterator begin, InputIterator end) const + { + std::vector columns(std::distance(begin, end)); + std::transform( + begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); + return table_view(columns); + } + /** * @brief Returns a table_view with set of specified columns. * @@ -120,7 +141,10 @@ class table { * @return A table_view consisting of columns from the original table * specified by the elements of `column_indices` */ - table_view select(std::vector const& column_indices) const; + table_view select(std::vector const& column_indices) const + { + return select(column_indices.begin(), column_indices.end()); + }; /** * @brief Returns a reference to the specified column diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 22f2073f73c..083366cc310 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -174,6 +174,25 @@ class table_view : public detail::table_view_base { */ table_view(std::vector const& views); + /** + * @brief Returns a table_view built from a range of column indices. + * + * @throws std::out_of_range + * If any index is outside [0, num_columns()) + * + * @param begin Beginning of the range + * @param end Ending of the range + * @return A table_view consisting of columns from the original table + * specified by the elements of `column_indices` + */ + template + table_view select(InputIterator begin, InputIterator end) const + { + std::vector columns(std::distance(begin, end)); + std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); + return table_view(columns); + } + /** * @brief Returns a table_view with set of specified columns. * diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu new file mode 100644 index 00000000000..336aabde15e --- /dev/null +++ b/cpp/src/lists/explode.cu @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace { + +std::unique_ptr
build_table( + table_view const& input_table, + size_type const explode_column_idx, + column_view const& sliced_child, + cudf::device_span gather_map, + thrust::optional> explode_col_gather_map, + thrust::optional> position_array, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto select_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; }); + + auto gathered_table = + detail::gather(input_table.select(select_iter, select_iter + input_table.num_columns() - 1), + gather_map.begin(), + gather_map.end(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + std::vector> columns = gathered_table.release()->release(); + + columns.insert(columns.begin() + explode_column_idx, + explode_col_gather_map + ? std::move(detail::gather(table_view({sliced_child}), + explode_col_gather_map->begin(), + explode_col_gather_map->end(), + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release()[0]) + : std::make_unique(sliced_child, stream, mr)); + + if (position_array) { + size_type position_size = position_array->size(); + columns.insert(columns.begin() + explode_column_idx, + std::make_unique( + data_type(type_to_id()), position_size, position_array->release())); + } + + return std::make_unique
<table>(std::move(columns));
+}
+}  // namespace
+
+std::unique_ptr<table>
explode(table_view const& input_table, + size_type const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + rmm::device_uvector gather_map(sliced_child.size(), stream); + + // Sliced columns may require rebasing of the offsets. + auto offsets = explode_col.offsets_begin(); + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each + // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by + // skipping the first entry and using the result of `lower_bound` directly. + thrust::lower_bound(rmm::exec_policy(stream), + offsets_minus_one, + offsets_minus_one + explode_col.size(), + counting_iter, + counting_iter + gather_map.size(), + gather_map.begin()); + + return build_table(input_table, + explode_column_idx, + sliced_child, + gather_map, + thrust::nullopt, + thrust::nullopt, + stream, + mr); +} + +std::unique_ptr
explode_position(table_view const& input_table, + size_type const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + rmm::device_uvector gather_map(sliced_child.size(), stream); + + // Sliced columns may require rebasing of the offsets. + auto offsets = explode_col.offsets_begin(); + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + rmm::device_uvector pos(sliced_child.size(), stream, mr); + + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each + // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by + // skipping the first entry and using the result of `lower_bound` directly. + thrust::transform( + rmm::exec_policy(stream), + counting_iter, + counting_iter + gather_map.size(), + gather_map.begin(), + [position_array = pos.data(), + offsets_minus_one, + offsets, + offset_size = explode_col.size()] __device__(auto idx) -> size_type { + auto lb_idx = thrust::distance( + offsets_minus_one, + thrust::lower_bound(thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx)); + position_array[idx] = idx - (offsets[lb_idx] - offsets[0]); + return lb_idx; + }); + + return build_table(input_table, + explode_column_idx, + sliced_child, + gather_map, + thrust::nullopt, + std::move(pos), + stream, + mr); +} + +std::unique_ptr
explode_outer(table_view const& input_table, + size_type const explode_column_idx, + bool include_position, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view explode_col{input_table.column(explode_column_idx)}; + auto sliced_child = explode_col.get_sliced_child(stream); + auto counting_iter = thrust::make_counting_iterator(0); + auto offsets = explode_col.offsets_begin(); + + // number of nulls or empty lists found so far in the explode column + rmm::device_uvector null_or_empty_offset(explode_col.size(), stream); + + auto null_or_empty = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [offsets, offsets_size = explode_col.size() - 1] __device__(int idx) { + return (idx > offsets_size || (offsets[idx + 1] != offsets[idx])) ? 0 : 1; + }); + thrust::inclusive_scan(rmm::exec_policy(stream), + null_or_empty, + null_or_empty + sliced_child.size(), + null_or_empty_offset.begin()); + + auto null_or_empty_count = + null_or_empty_offset.size() > 0 ? null_or_empty_offset.back_element(stream) : 0; + if (null_or_empty_count == 0) { + // performance penalty to run the below loop if there are no nulls or empty lists. + // run simple explode instead + return include_position ? explode_position(input_table, explode_column_idx, stream, mr) + : explode(input_table, explode_column_idx, stream, mr); + } + + auto gather_map_size = sliced_child.size() + null_or_empty_count; + + rmm::device_uvector gather_map(gather_map_size, stream); + rmm::device_uvector explode_col_gather_map(gather_map_size, stream); + rmm::device_uvector pos(include_position ? gather_map_size : 0, stream, mr); + + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. + auto offsets_minus_one = thrust::make_transform_iterator( + thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + // Fill in gather map with all the child column's entries + thrust::for_each(rmm::exec_policy(stream), + counting_iter, + counting_iter + sliced_child.size(), + [offsets_minus_one, + gather_map = gather_map.begin(), + explode_col_gather_map = explode_col_gather_map.begin(), + position_array = pos.begin(), + include_position, + offsets, + null_or_empty_offset = null_or_empty_offset.begin(), + null_or_empty, + offset_size = explode_col.offsets().size() - 1] __device__(auto idx) { + auto lb_idx = thrust::distance( + offsets_minus_one, + thrust::lower_bound( + thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx)); + auto index_to_write = null_or_empty_offset[lb_idx] + idx; + gather_map[index_to_write] = lb_idx; + explode_col_gather_map[index_to_write] = idx; + if (include_position) { + position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]); + } + if (null_or_empty[idx]) { + auto invalid_index = null_or_empty_offset[idx] == 0 + ? offsets[idx] + : offsets[idx] + null_or_empty_offset[idx] - 1; + gather_map[invalid_index] = idx; + + // negative one to indicate a null value + explode_col_gather_map[invalid_index] = -1; + + if (include_position) { position_array[invalid_index] = 0; } + } + }); + + return build_table( + input_table, + explode_column_idx, + sliced_child, + gather_map, + explode_col_gather_map, + include_position ? std::move(pos) : thrust::optional>{}, + stream, + mr); +} + +} // namespace detail + +/** + * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) + */ +std::unique_ptr
<table> explode(table_view const& input_table,
+                               size_type explode_column_idx,
+                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
+               "Unsupported non-list column");
+  return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
+}
+
+/**
+ * @copydoc cudf::explode_position(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ */
+std::unique_ptr<table>
explode_position(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode_position(input_table, explode_column_idx, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::explode_outer(input_table,explode_column_idx,rmm::mr::device_memory_resource) + */ +std::unique_ptr
explode_outer(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode_outer( + input_table, explode_column_idx, false, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc + * cudf::explode_outer_position(input_table,explode_column_idx,rmm::mr::device_memory_resource) + */ +std::unique_ptr
explode_outer_position(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, + "Unsupported non-list column"); + return detail::explode_outer(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu deleted file mode 100644 index 34d7d8fe31d..00000000000 --- a/cpp/src/reshape/explode.cu +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -namespace cudf { -namespace detail { -namespace { -/** - * @brief Function object for exploding a column. - */ -struct explode_functor { - /** - * @brief Function object for exploding a column. - */ - template - std::unique_ptr
operator()(table_view const& input_table, - size_type const explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - CUDF_FAIL("Unsupported non-list column"); - - return std::make_unique
(); - } -}; - -template <> -std::unique_ptr
explode_functor::operator()( - table_view const& input_table, - size_type const explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - lists_column_view lc{input_table.column(explode_column_idx)}; - auto sliced_child = lc.get_sliced_child(stream); - rmm::device_uvector gather_map_indices(sliced_child.size(), stream); - - // Sliced columns may require rebasing of the offsets. - auto offsets = lc.offsets_begin(); - // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. - auto offsets_minus_one = thrust::make_transform_iterator( - offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); - auto counting_iter = thrust::make_counting_iterator(0); - - rmm::device_uvector pos(include_pos ? sliced_child.size() : 0, stream, mr); - - // This looks like an off-by-one bug, but what is going on here is that we need to reduce each - // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by - // skipping the first entry and using the result of `lower_bound` directly. - if (include_pos) { - thrust::transform( - rmm::exec_policy(stream), - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin(), - [position_array = pos.data(), offsets_minus_one, offsets, offset_size = lc.size()] __device__( - auto idx) -> size_type { - auto lb_idx = thrust::lower_bound( - thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx) - - offsets_minus_one; - position_array[idx] = idx - (offsets[lb_idx] - offsets[0]); - return lb_idx; - }); - } else { - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + lc.size(), - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin()); - } - - auto select_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; }); - std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); - - auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), - gather_map_indices.begin(), - gather_map_indices.end(), - cudf::out_of_bounds_policy::DONT_CHECK, - stream, - mr); - - std::vector> columns = gathered_table.release()->release(); - - columns.insert(columns.begin() + explode_column_idx, - std::make_unique(sliced_child, stream, mr)); - - if (include_pos) { - columns.insert(columns.begin() + explode_column_idx, - std::make_unique( - data_type(type_to_id()), sliced_child.size(), pos.release())); - } - - return std::make_unique
(std::move(columns)); -} -} // namespace - -/** - * @copydoc - * cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr
explode(table_view const& input_table, - size_type explode_column_idx, - bool include_pos, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return type_dispatcher(input_table.column(explode_column_idx).type(), - explode_functor{}, - input_table, - explode_column_idx, - include_pos, - stream, - mr); -} - -} // namespace detail - -/** - * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) - */ -std::unique_ptr
explode(table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::explode(input_table, explode_column_idx, false, rmm::cuda_stream_default, mr); -} - -/** - * @copydoc cudf::explode_position(input_table,explode_column_idx,rmm::mr::device_memory_resource) - */ -std::unique_ptr
explode_position(table_view const& input_table, - size_type explode_column_idx, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::explode(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr); -} - -} // namespace cudf diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index afda6313254..4cd85fc5e7e 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -81,12 +81,4 @@ std::vector> table::release() return std::move(_columns); } -// Returns a table_view with set of specified columns -table_view table::select(std::vector const& column_indices) const -{ - std::vector columns; - for (auto index : column_indices) { columns.push_back(_columns.at(index)->view()); } - return table_view(columns); -} - } // namespace cudf diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 9c421f6fd36..c64bf5b2823 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -63,11 +63,7 @@ template class table_view_base; // Returns a table_view with set of specified columns table_view table_view::select(std::vector const& column_indices) const { - std::vector columns(column_indices.size()); - std::transform(column_indices.begin(), column_indices.end(), columns.begin(), [this](auto index) { - return this->column(index); - }); - return table_view(columns); + return select(column_indices.begin(), column_indices.end()); } // Convert mutable view to immutable view diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b94e9587fc0..e95aab16098 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -287,7 +287,6 @@ ConfigureTest(SEARCH_TEST search/search_test.cpp) # - reshape test ---------------------------------------------------------------------------------- ConfigureTest(RESHAPE_TEST reshape/byte_cast_tests.cpp - reshape/explode_tests.cpp reshape/interleave_columns_tests.cpp reshape/tile_tests.cpp) @@ -390,6 +389,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) ConfigureTest(LISTS_TEST lists/contains_tests.cpp lists/count_elements_tests.cpp + lists/explode_tests.cpp lists/drop_list_duplicates_tests.cpp lists/extract_tests.cpp lists/sort_lists_tests.cpp) diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp new file mode 100644 index 00000000000..2ec9294d118 --- /dev/null +++ b/cpp/tests/lists/explode_tests.cpp @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +using namespace cudf::test; +using FCW = fixed_width_column_wrapper; +using LCW = lists_column_wrapper; + +class ExplodeTest : public cudf::test::BaseFixture { +}; + +class ExplodeOuterTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeTypedTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeOuterTypedTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); + +TYPED_TEST_CASE(ExplodeOuterTypedTest, cudf::test::FixedPointTypes); + +TEST_F(ExplodeTest, Empty) +{ + cudf::table_view t({LCW{}, FCW{}}); + + auto ret = cudf::explode(t, 0); + + cudf::table_view expected({FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + auto pos_ret = cudf::explode_position(t, 0); + + cudf::table_view pos_expected({FCW{}, FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NonList) +{ + cudf::table_view t({FCW{100, 200, 300}, FCW{100, 200, 300}}); + + EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); + EXPECT_THROW(cudf::explode_position(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeTest, Basics) +{ + // a b c + // 100 [1, 2, 7] string0 + // 200 [5, 6] string1 + // 300 [0, 3] string2 + + FCW a{100, 200, 300}; + LCW b{LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + FCW expected_a{100, 100, 100, 200, 200, 300, 300}; + FCW expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode(t, 1); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); + + auto pos_ret = cudf::explode_position(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, SingleNull) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + auto first_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + FCW b({100, 200, 300, 400}); + + FCW expected_a{5, 6, 0, 3}; + FCW expected_b{200, 200, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, Nulls) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [0, 3] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + auto always_valid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + FCW b({100, 200, 300}, valids); + + FCW expected_a({1, 2, 7, 0, 3}); + FCW expected_b({100, 100, 100, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInList) +{ + // a b + // [1, 2, 7] 100 + // [5, 6, 0, 9] 200 + // [] 300 + // [0, 3, 8] 400 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + FCW b{100, 200, 300, 400}; + + FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + FCW expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, Nested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[],[5],[2, 1]] 300 + + LCW a{LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}}; + FCW b{100, 200, 300}; + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}; + FCW expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 3}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NestedNulls) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + auto always_valid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, 200, 300}, valids); + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{0, 3}, LCW{5}, LCW{2, 1}}; + FCW expected_b({100, 100, 300, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInNested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b({100, 200, 300}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NullsInNestedDoubleExplode) +{ + // a b + // [[1, 2], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + FCW b{100, 200, 300}; + + FCW expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_b{100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto first_explode_ret = cudf::explode(t, 0); + auto ret = cudf::explode(first_explode_ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(first_explode_ret->view(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, NestedStructs) +{ + // a b + // [[1, 2], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, 1]] {300, "300"} + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TYPED_TEST(ExplodeTypedTest, ListOfStructs) +{ + // a b + // [{70, "70"}, {75, "75"}] 100 + // [{50, "50"}, {55, "55"}] 200 + // [{35, "35"}, {45, "45"}] 300 + // [{25, "25"}, {30, "30"}] 400 + // [{15, "15"}, {20, "20"}] 500 + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column( + 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + + FCW b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + FCW expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeTest, SlicedList) +{ + // a b + // [[1, 2],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + // [[8, 3],[],[4, 3, 1, 2]] 400 + // [[2, 3, 4],[9, 8]] 500 + + // slicing the top 2 rows and the bottom row off + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{2, 3, 4}, LCW{9, 8}}}); + FCW b({100, 200, 300, 400, 500}); + + LCW expected_a{ + LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + FCW expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(sliced_t[0], 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Empty) +{ + LCW a{}; + FCW b{}; + + cudf::table_view t({LCW{}, FCW{}}); + + auto ret = cudf::explode_outer(t, 0); + + FCW expected_a{}; + FCW expected_b{}; + cudf::table_view expected({FCW{}, FCW{}}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeOuterTest, NonList) +{ + cudf::table_view t({FCW{100, 200, 300}, FCW{100, 200, 300}}); + + EXPECT_THROW(cudf::explode_outer(t, 1), cudf::logic_error); + EXPECT_THROW(cudf::explode_outer_position(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeOuterTest, Basics) +{ + // a b c + // 100 [1, 2, 7] string0 + // 200 [5, 6] string1 + // 300 [0, 3] string2 + + FCW a{100, 200, 300}; + LCW b{LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + FCW expected_a{100, 100, 100, 200, 200, 300, 300}; + FCW expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode_outer(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); + + auto pos_ret = cudf::explode_outer_position(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SingleNull) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + auto first_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + FCW b({100, 200, 300, 400}); + + FCW expected_a{{0, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}}; + FCW expected_b{100, 200, 200, 300, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Nulls) +{ + // a b + // [1, 2, 7] 100 + // [5, 6] 200 + // [0, 3] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + FCW b({100, 200, 300}, valids); + + FCW expected_a({1, 2, 7, 0, 0, 3}, {1, 1, 1, 0, 1, 1}); + FCW expected_b({100, 100, 100, 200, 300, 300}, {1, 1, 1, 0, 1, 1}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInList) +{ + // a b + // [1, 2, 7] 100 + // [5, 6, 0, 9] 200 + // [] 300 + // [0, 3, 8] 400 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + FCW b{100, 200, 300, 400}; + + FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}); + FCW expected_b{100, 100, 100, 200, 200, 200, 200, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, Nested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[],[5],[2, 1]] 300 + + LCW a{LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}}; + FCW b{100, 200, 300}; + + LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{}, LCW{5}, LCW{2, 1}}; + FCW expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 3}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NestedNulls) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, 200, 300}); + + auto expected_valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? 
false : true; }); + LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, expected_valids); + FCW expected_b({100, 100, 200, 300, 300, 300}); + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInNested) +{ + // a b + // [[1, 2], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b({100, 200, 300}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) +{ + // a b + // [[1, 2], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + FCW b{100, 200, 300}; + + FCW expected_a({1, 2, 0, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_b{100, 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto first_explode_ret = cudf::explode_outer(t, 0); + auto ret = cudf::explode_outer(first_explode_ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(first_explode_ret->view(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, NestedStructs) +{ + // a b + // [[1, 2], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, 1]] {300, "300"} + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + FCW b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + LCW expected_a{ + LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + FCW expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs) +{ + // a b + // [{70, "70"}, {75, "75"}] 100 + // [{50, "50"}, {55, "55"}] 200 + // [{35, "35"}, {45, "45"}] 300 + // [{25, "25"}, {30, "30"}] 400 + // [{15, "15"}, {20, "20"}] 500 + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column( + 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + + FCW b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode_outer(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + FCW expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SlicedList) +{ + // a b + // [[1, 2],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, 1]] 300 + // [[8, 3],[],[4, 3, 1, 2]] 400 + // [[2, 3, 4],[9, 8]] 500 + + // slicing the top 2 rows and the bottom row off + + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW{LCW{5, 6}}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{2, 3, 4}, LCW{9, 8}}}); + FCW b({100, 200, 300, 400, 500}); + + LCW expected_a{ + LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + FCW expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(sliced_t[0], 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp deleted file mode 100644 index 5f3237ce46d..00000000000 --- a/cpp/tests/reshape/explode_tests.cpp +++ /dev/null @@ -1,530 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include - -using namespace cudf::test; - -class ExplodeTest : public cudf::test::BaseFixture { -}; - -template -class ExplodeTypedTest : public cudf::test::BaseFixture { -}; - -TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); - -TEST_F(ExplodeTest, Empty) -{ - lists_column_wrapper a{}; - fixed_width_column_wrapper b{}; - - cudf::table_view t({a, b}); - - auto ret = cudf::explode(t, 0); - - fixed_width_column_wrapper expected_a{}; - fixed_width_column_wrapper expected_b{}; - cudf::table_view expected({expected_a, expected_b}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - auto pos_ret = cudf::explode_position(t, 0); - - fixed_width_column_wrapper expected_c{}; - cudf::table_view pos_expected({expected_a, expected_b, expected_c}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NonList) -{ - fixed_width_column_wrapper a{100, 200, 300}; - fixed_width_column_wrapper b{100, 200, 300}; - - cudf::table_view t({a, b}); - - EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); - EXPECT_THROW(cudf::explode_position(t, 1), cudf::logic_error); -} - -TEST_F(ExplodeTest, Basics) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [0, 3] 300 - */ - - fixed_width_column_wrapper a{100, 200, 300}; - lists_column_wrapper b{lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}}; - strings_column_wrapper c{"string0", "string1", "string2"}; - - fixed_width_column_wrapper expected_a{100, 100, 100, 200, 200, 300, 300}; - fixed_width_column_wrapper expected_b{1, 2, 7, 5, 6, 0, 3}; - strings_column_wrapper expected_c{ - "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; - - cudf::table_view t({a, b, c}); - cudf::table_view 
expected({expected_a, expected_b, expected_c}); - - auto ret = cudf::explode(t, 1); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 0, 1}; - cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); - - auto pos_ret = cudf::explode_position(t, 1); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, SingleNull) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [] 300 - [0, 3] 400 - */ - - auto first_invalid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); - - lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{}, - lists_column_wrapper{0, 3}}, - first_invalid); - fixed_width_column_wrapper b({100, 200, 300, 400}); - - fixed_width_column_wrapper expected_a{5, 6, 0, 3}; - fixed_width_column_wrapper expected_b{200, 200, 400, 400}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, Nulls) -{ - /* - a b - [1, 2, 7] 100 - [5, 6] 200 - [0, 3] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - auto always_valid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}}, - valids); - fixed_width_column_wrapper b({100, 200, 300}, valids); - - fixed_width_column_wrapper expected_a({1, 2, 7, 0, 3}); - fixed_width_column_wrapper expected_b({100, 100, 100, 300, 300}, always_valid); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInList) -{ - /* - a b - [1, 2, 7] 100 - [5, 6, 0, 9] 200 - [] 300 - [0, 3, 8] 400 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), - lists_column_wrapper({5, 6, 0, 9}, valids), - lists_column_wrapper{}, - lists_column_wrapper({0, 3, 8}, valids)}; - fixed_width_column_wrapper b{100, 200, 300, 400}; - - fixed_width_column_wrapper expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, - {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); - fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, Nested) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[],[5],[2, 1]] 300 - */ - - lists_column_wrapper a{ - lists_column_wrapper{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}}; - fixed_width_column_wrapper b{100, 200, 300}; - - lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}; - fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2, 3}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NestedNulls) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - auto always_valid = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}}, - valids); - fixed_width_column_wrapper b({100, 200, 300}, valids); - - lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper{2, 1}}; - fixed_width_column_wrapper expected_b({100, 100, 300, 300, 300}, always_valid); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInNested) -{ - /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}); - fixed_width_column_wrapper b({100, 200, 300}); - - lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}; - fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NullsInNestedDoubleExplode) -{ - /* - a b - [[1, 2], [], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); - - lists_column_wrapper a{ - lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{}, - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}; - fixed_width_column_wrapper b{100, 200, 300}; - - fixed_width_column_wrapper expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - fixed_width_column_wrapper expected_b{ - 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto first_explode_ret = cudf::explode(t, 0); - auto ret = cudf::explode(first_explode_ret->view(), 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(first_explode_ret->view(), 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, NestedStructs) -{ - /* - a b - [[1, 2], [7, 6, 5]] {100, "100"} - [[5, 6]] {200, "200"} - [[0, 3],[5],[2, 1]] {300, "300"} - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}}); - fixed_width_column_wrapper b1({100, 200, 300}); - strings_column_wrapper b2{"100", "200", "300"}; - structs_column_wrapper b({b1, b2}); - - lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}, - lists_column_wrapper{5, 6}, - lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}; - fixed_width_column_wrapper expected_b1{100, 100, 200, 300, 300, 300}; - strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; - structs_column_wrapper expected_b({expected_b1, expected_b2}); - - cudf::table_view t({a, b}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(t, 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TYPED_TEST(ExplodeTypedTest, ListOfStructs) -{ - /* - a b - [{70, "70"}, {75, "75"}] 100 - [{50, "50"}, {55, "55"}] 200 - [{35, "35"}, {45, "45"}] 300 - [{25, "25"}, {30, "30"}] 400 - [{15, "15"}, {20, "20"}] 500 -*/ - - auto numeric_col = - fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; - strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; - auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); - auto a = cudf::make_lists_column(5, - fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), - std::move(struct_col), - cudf::UNKNOWN_NULL_COUNT, - {}); - - fixed_width_column_wrapper b{100, 200, 300, 400, 500}; - - cudf::table_view t({a->view(), b}); - auto ret = cudf::explode(t, 0); - - 
auto expected_numeric_col = - fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; - strings_column_wrapper expected_string_col{ - "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; - - auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); - fixed_width_column_wrapper expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; - - cudf::table_view expected({expected_a->view(), expected_b}); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); - - auto pos_ret = cudf::explode_position(t, 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} - -TEST_F(ExplodeTest, SlicedList) -{ - /* - a b - [[1, 2],[7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - [[8, 3],[],[4, 3, 1, 2]] 400 - [[2, 3, 4],[9, 8]] 500 - - slicing the top 2 rows and the bottom row off - */ - - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); - - lists_column_wrapper a( - {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), - lists_column_wrapper{7, 6, 5}}, - lists_column_wrapper{lists_column_wrapper{5, 6}}, - lists_column_wrapper{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids)}, - lists_column_wrapper{lists_column_wrapper{8, 3}, - lists_column_wrapper{}, - lists_column_wrapper({4, 3, 1, 2}, valids)}, - lists_column_wrapper{lists_column_wrapper{2, 3, 4}, - lists_column_wrapper{9, 8}}}); - fixed_width_column_wrapper b({100, 200, 300, 400, 500}); - - lists_column_wrapper expected_a{lists_column_wrapper{0, 3}, - lists_column_wrapper{5}, - lists_column_wrapper({2, 1}, valids), - lists_column_wrapper{8, 3}, - lists_column_wrapper{}, - lists_column_wrapper({4, 3, 1, 2}, valids)}; - fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400, 400}; - - cudf::table_view t({a, b}); - auto sliced_t = cudf::slice(t, {2, 4}); - cudf::table_view expected({expected_a, expected_b}); - - auto ret = cudf::explode(sliced_t[0], 0); - - CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); - - fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 2}; - cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); - - auto pos_ret = cudf::explode_position(sliced_t[0], 0); - CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); -} From 39ad863652d02b5f5e3b16b0fcdfe254a9e6a348 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 17 Mar 2021 06:06:13 -0700 Subject: [PATCH 11/21] Use cuFile for Parquet IO when available (#7444) Adds optional cuFile integration: - `cufile.h` is included in the build when available. - `libcufile.so` is loaded at runtime if `LIBCUDF_CUFILE_POLICY` environment variable is set to "ALWAYS" or "GDS". - cuFile compatibility mode is set through the same policy variable - "ALWAYS" means on, "GDS" means off. - cuFile is currently only used on Parquet R/W and in CSV writer. - device_read/write API can be used with file datasource/data_sink. - Added CUDA stream to `device_read`. 
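
As a rough usage sketch of the policy variable described in the points above (not part of this PR's code; the file paths are placeholders and the exact moment the variable is read is an assumption), setting `LIBCUDF_CUFILE_POLICY` before the first cuDF I/O call is enough to route Parquet reads/writes through cuFile when the library and a GDS-capable filesystem are available:

```python
import os

# Assumption: the policy is read lazily on first file access, so setting it
# before any cuDF I/O call is sufficient. Per this PR, "GDS" enables cuFile
# with compatibility mode off and falls back to host I/O if cuFile cannot be
# used; "ALWAYS" enables compatibility mode and does not fall back.
os.environ["LIBCUDF_CUFILE_POLICY"] = "GDS"

import cudf

df = cudf.read_parquet("example.parquet")   # placeholder input path
df.to_parquet("example_out.parquet")        # placeholder output path
```
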
Authors: - Vukasin Milovanovic (@vuule) Approvers: - Keith Kraus (@kkraus14) - Karthikeyan (@karthikeyann) - Devavret Makkar (@devavret) - Robert Maynard (@robertmaynard) URL: https://github.com/rapidsai/cudf/pull/7444 --- cpp/CMakeLists.txt | 8 + cpp/benchmarks/fixture/benchmark_fixture.hpp | 2 +- cpp/cmake/Modules/FindcuFile.cmake | 6 + cpp/include/cudf/io/data_sink.hpp | 26 +- cpp/include/cudf/io/datasource.hpp | 94 ++++++- cpp/src/io/csv/writer_impl.cu | 32 +-- cpp/src/io/parquet/parquet_gpu.hpp | 4 +- cpp/src/io/parquet/reader_impl.cu | 23 +- cpp/src/io/parquet/reader_impl.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 24 +- cpp/src/io/utilities/data_sink.cpp | 45 +++- cpp/src/io/utilities/datasource.cpp | 107 +++++--- cpp/src/io/utilities/file_io_utilities.cpp | 264 +++++++++++++++++++ cpp/src/io/utilities/file_io_utilities.hpp | 242 +++++++++++++++++ 14 files changed, 757 insertions(+), 124 deletions(-) create mode 100644 cpp/src/io/utilities/file_io_utilities.cpp create mode 100644 cpp/src/io/utilities/file_io_utilities.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 39acc362450..f15fd649b83 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -139,6 +139,8 @@ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) include(cmake/thirdparty/CUDF_GetGTest.cmake) # Stringify libcudf and libcudacxx headers used in JIT operations include(cmake/Modules/StringifyJITHeaders.cmake) +# find cuFile +include(cmake/Modules/FindcuFile.cmake) ################################################################################################### # - library targets ------------------------------------------------------------------------------- @@ -244,6 +246,7 @@ add_library(cudf src/io/statistics/column_stats.cu src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp + src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu src/io/utilities/type_conversion.cpp src/jit/cache.cpp @@ -469,6 +472,11 @@ else() target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart CUDA::cuda_driver) endif() +# Add cuFile interface if available +if(TARGET cuFile::cuFile_interface) + target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) +endif() + file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld" [=[ SECTIONS diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index ad2ce095b6e..dd1bbcba0b4 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -88,4 +88,4 @@ class benchmark : public ::benchmark::Fixture { std::shared_ptr mr; }; -}; // namespace cudf +} // namespace cudf diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index e67b79d9d60..4f67e186f42 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -93,6 +93,12 @@ find_package_handle_standard_args(cuFile cuFile_VERSION ) +if (cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface) + add_library(cuFile::cuFile_interface IMPORTED INTERFACE) + target_include_directories(cuFile::cuFile_interface INTERFACE "$") + target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") + target_compile_definitions(cuFile::cuFile_interface INTERFACE CUFILE_FOUND) +endif () if (cuFile_FOUND AND NOT TARGET cuFile::cuFile) add_library(cuFile::cuFile UNKNOWN IMPORTED) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 0ae403458a0..e0eb60af070 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ 
b/cpp/include/cudf/io/data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,23 +107,35 @@ class data_sink { */ virtual bool supports_device_write() const { return false; } + /** + * @brief Estimates whether a direct device write would be more optimal for the given size. + * + * @param size Number of bytes to write + * @return whether the device write is expected to be more performant for the given size + */ + virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); } + /** * @brief Append the buffer content to the sink from a gpu address * - * @param[in] data Pointer to the buffer to be written into the sink object - * @param[in] size Number of bytes to write + * For optimal performance, should only be called when `is_device_write_preferred` returns `true`. + * Data sink implementations that don't support direct device writes don't need to override + * this function. * - * @return void + * @throws cudf::logic_error the object does not support direct device writes, i.e. + * `supports_device_write` returns `false`. + * + * @param gpu_data Pointer to the buffer to be written into the sink object + * @param size Number of bytes to write + * @param stream CUDA stream to use */ virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { - CUDF_FAIL("data_sink classes that support device_write must override this function."); + CUDF_FAIL("data_sink classes that support device_write must override it."); } /** * @brief Flush the data written into the sink - * - * @return void */ virtual void flush() = 0; diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 88f2bd187e2..8fcc045e6d2 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -50,12 +52,15 @@ class datasource { /** * @brief Returns the address of the data in the buffer. */ - virtual const uint8_t* data() const = 0; + virtual uint8_t const* data() const = 0; /** * @brief Base class destructor */ virtual ~buffer() {} + + template + static std::unique_ptr create(Container&& data_owner); }; /** @@ -147,37 +152,57 @@ class datasource { */ virtual bool supports_device_read() const { return false; } + /** + * @brief Estimates whether a direct device read would be more optimal for the given size. + * + * @param size Number of bytes to read + * @return whether the device read is expected to be more performant for the given size + */ + virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } + /** * @brief Returns a device buffer with a subset of data from the source. * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read + * @throws cudf::logic_error the object does not support direct device reads, i.e. 
+ * `supports_device_read` returns `false`. + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use * * @return The data buffer in the device memory */ - virtual std::unique_ptr device_read(size_t offset, size_t size) + virtual std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** * @brief Reads a selected range into a preallocated device buffer * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read - * @param[in] dst Address of the existing device memory + * @throws cudf::logic_error when the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use * * @return The number of bytes read (can be smaller than size) */ - virtual size_t device_read(size_t offset, size_t size, uint8_t* dst) + virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** @@ -205,14 +230,57 @@ class datasource { size_t size() const override { return _size; } - const uint8_t* data() const override { return _data; } + uint8_t const* data() const override { return _data; } private: uint8_t* const _data; size_t const _size; }; + + /** + * @brief Derived implementation of `buffer` that owns the data. + * + * Can use different container types to hold the data buffer. + * + * @tparam Container Type of the container object that owns the data + */ + template + class owning_buffer : public buffer { + public: + /** + * @brief Moves the input container into the newly created object. + */ + owning_buffer(Container&& data_owner) + : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) + { + } + + /** + * @brief Moves the input container into the newly created object, and exposes a subspan of the + * buffer. + */ + owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) + : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + { + } + + size_t size() const override { return _size; } + + uint8_t const* data() const override { return static_cast(_data_ptr); } + + private: + Container _data; + void const* _data_ptr; + size_t _size; + }; }; +template +std::unique_ptr datasource::buffer::create(Container&& data_owner) +{ + return std::make_unique>(std::move(data_owner)); +} + /** * @brief Implementation class for reading from an Apache Arrow file. The file * could be a memory-mapped file or other implementation supported by Arrow. 
@@ -230,7 +298,7 @@ class arrow_io_source : public datasource { { } size_t size() const override { return arrow_buffer->size(); } - const uint8_t* data() const override { return arrow_buffer->data(); } + uint8_t const* data() const override { return arrow_buffer->data(); } }; public: diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index dda2e0704f6..f7e153d71f4 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -416,36 +416,28 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars().data(); - if (out_sink_->supports_device_write()) { - // host algorithm call, but the underlying call - // is a device_write taking a device buffer; - // + if (out_sink_->is_device_write_preferred(total_num_bytes)) { + // Direct write from device memory out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); - out_sink_->device_write(newline.data(), - newline.size(), - stream); // needs newline at the end, to separate from next chunk } else { - // no device write possible; - // - // copy the bytes to host, too: - // + // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), ptr_all_bytes, total_num_bytes * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); - // host algorithm call, where the underlying call - // is also host_write taking a host buffer; - // - char const* ptr_h_bytes = h_bytes.data(); - out_sink_->host_write(ptr_h_bytes, total_num_bytes); + out_sink_->host_write(h_bytes.data(), total_num_bytes); + } + + // Needs newline at the end, to separate from next chunk + if (out_sink_->is_device_write_preferred(newline.size())) { + out_sink_->device_write(newline.data(), newline.size(), stream); + } else { out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator() - .size()); // needs newline at the end, to separate from next chunk + options_.get_line_terminator().size()); } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 43d144ec980..f920aee1c29 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,7 +184,7 @@ struct ColumnChunkDesc { { } - uint8_t *compressed_data; // pointer to compressed column chunk data + uint8_t const *compressed_data; // pointer to compressed column chunk data size_t compressed_size; // total compressed data size for this chunk size_t num_values; // total number of values in this column size_t start_row; // starting row of this chunk diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a7a02cc6108..16cf0877c23 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -822,7 +822,7 @@ void generate_depth_remappings(std::map, std::ve * @copydoc cudf::io::detail::parquet::read_column_chunks */ void reader::impl::read_column_chunks( - std::vector &page_data, + std::vector> &page_data, hostdevice_vector &chunks, // TODO const? size_t begin_chunk, size_t end_chunk, @@ -850,9 +850,15 @@ void reader::impl::read_column_chunks( next_chunk++; } if (io_size != 0) { - auto buffer = _sources[chunk_source_map[chunk]]->host_read(io_offset, io_size); - page_data[chunk] = rmm::device_buffer(buffer->data(), buffer->size(), stream); - uint8_t *d_compdata = static_cast(page_data[chunk].data()); + auto &source = _sources[chunk_source_map[chunk]]; + if (source->is_device_read_preferred(io_size)) { + page_data[chunk] = source->device_read(io_offset, io_size, stream); + } else { + auto const buffer = source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } + auto d_compdata = page_data[chunk]->data(); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -1414,7 +1420,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - std::vector page_data(num_chunks); + std::vector> page_data(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1516,10 +1522,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, decomp_page_data = decompress_page_data(chunks, pages, stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED && page_data[c].size() != 0) { - page_data[c].resize(0); - page_data[c].shrink_to_fit(); - } + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 137fca03bfd..ca200936134 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,7 +91,7 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches. 
* */ - void read_column_chunks(std::vector &page_data, + void read_column_chunks(std::vector> &page_data, hostdevice_vector &chunks, size_t begin_chunk, size_t end_chunk, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a645ca0fd91..dd68bc50043 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1108,19 +1108,7 @@ void writer::impl::write(table_view const &table) num_stats_bfr); } - auto host_bfr = [&]() { - // if the writer supports device_write(), we don't need this scratch space - if (out_sink_->supports_device_write()) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_chunk_bfr_size), - cudaFreeHost}; - } - }(); + pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); @@ -1155,7 +1143,7 @@ void writer::impl::write(table_view const &table) dev_bfr = ck->uncompressed_bfr; } - if (out_sink_->supports_device_write()) { + if (out_sink_->is_device_write_preferred(ck->compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream); // we still need to do a (much smaller) memcpy for the statistics. @@ -1170,6 +1158,14 @@ void writer::impl::write(table_view const &table) stream.synchronize(); } } else { + if (!host_bfr) { + host_bfr = pinned_buffer{[](size_t size) { + uint8_t *ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_chunk_bfr_size), + cudaFreeHost}; + } // copy the full data CUDA_TRY(cudaMemcpyAsync(host_bfr.get(), dev_bfr, diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 48558005303..10af7bcb0bd 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include @@ -29,24 +30,44 @@ namespace io { class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) + : _cufile_out(detail::make_cufile_output(filepath)) { - outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); - CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); + _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); + CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); } virtual ~file_sink() { flush(); } void host_write(void const* data, size_t size) override { - outfile_.write(static_cast(data), size); + _output_stream.seekp(_bytes_written); + _output_stream.write(static_cast(data), size); + _bytes_written += size; } - void flush() override { outfile_.flush(); } + void flush() override { _output_stream.flush(); } - size_t bytes_written() override { return outfile_.tellp(); } + size_t bytes_written() override { return _bytes_written; } + + bool supports_device_write() const override { return _cufile_out != nullptr; } + + bool is_device_write_preferred(size_t size) const override + { + return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); + } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + + _cufile_out->write(gpu_data, _bytes_written, size); + _bytes_written += size; + } private: - std::ofstream outfile_; + std::ofstream _output_stream; + size_t _bytes_written = 0; + std::unique_ptr _cufile_out; }; /** @@ -77,25 +98,25 @@ class host_buffer_sink : public data_sink { */ class void_sink : public data_sink { public: - explicit void_sink() : bytes_written_(0) {} + explicit void_sink() : _bytes_written(0) {} virtual ~void_sink() {} - void host_write(void const* data, size_t size) override { bytes_written_ += size; } + void host_write(void const* data, size_t size) override { _bytes_written += size; } bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { - bytes_written_ += size; + _bytes_written += size; } void flush() override {} - size_t bytes_written() override { return bytes_written_; } + size_t bytes_written() override { return _bytes_written; } private: - size_t bytes_written_; + size_t _bytes_written; }; class user_sink_wrapper : public data_sink { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 74163d023be..3f2884d5b7d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,14 @@ * limitations under the License. */ +#include + #include #include -#include -#include #include -#include #include +#include namespace cudf { namespace io { @@ -34,12 +34,6 @@ namespace io { * mapping a subset of the file where the starting offset may not be zero. 
*/ class memory_mapped_source : public datasource { - struct file_wrapper { - const int fd = -1; - explicit file_wrapper(const char *filepath) : fd(open(filepath, O_RDONLY)) {} - ~file_wrapper() { close(fd); } - }; - class memory_mapped_buffer : public buffer { size_t _size = 0; uint8_t *_data = nullptr; @@ -52,77 +46,99 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) + : _cufile_in(detail::make_cufile_input(filepath)) { - auto const file = file_wrapper(filepath); - CUDF_EXPECTS(file.fd != -1, "Cannot open file"); - - struct stat st; - CUDF_EXPECTS(fstat(file.fd, &st) != -1, "Cannot query file size"); - file_size_ = static_cast(st.st_size); - - if (file_size_ != 0) { map(file.fd, offset, size); } + auto const file = detail::file_wrapper(filepath, O_RDONLY); + _file_size = file.size(); + if (_file_size != 0) { map(file.desc(), offset, size); } } virtual ~memory_mapped_source() { - if (map_addr_ != nullptr) { munmap(map_addr_, map_size_); } + if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); return std::make_unique( - static_cast(map_addr_) + (offset - map_offset_), read_size); + static_cast(_map_addr) + (offset - _map_offset), read_size); } size_t host_read(size_t offset, size_t size, uint8_t *dst) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - auto const src = static_cast(map_addr_) + (offset - map_offset_); + auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); return read_size; } - size_t size() const override { return file_size_; } + bool supports_device_read() const override { return _cufile_in != nullptr; } + + bool is_device_read_preferred(size_t size) const + { + return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, stream); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, dst, stream); + } + + size_t size() const override { return _file_size; } private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < file_size_, "Offset is past end of file"); + CUDF_EXPECTS(offset < _file_size, "Offset is past end of file"); // Offset for `mmap()` must be page 
aligned - auto const map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); + _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); // Clamp length to available data in the file if (size == 0) { - size = file_size_ - offset; + size = _file_size - offset; } else { - if ((offset + size) > file_size_) { size = file_size_ - offset; } + if ((offset + size) > _file_size) { size = _file_size - offset; } } // Size for `mmap()` needs to include the page padding - const auto map_size = size + (offset - map_offset); + _map_size = size + (offset - _map_offset); // Check if accessing a region within already mapped area - map_addr_ = mmap(NULL, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset); - CUDF_EXPECTS(map_addr_ != MAP_FAILED, "Cannot create memory mapping"); - map_offset_ = map_offset; - map_size_ = map_size; + _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); + CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } private: - size_t file_size_ = 0; - void *map_addr_ = nullptr; - size_t map_size_ = 0; - size_t map_offset_ = 0; + size_t _file_size = 0; + void *_map_addr = nullptr; + size_t _map_size = 0; + size_t _map_offset = 0; + std::unique_ptr _cufile_in; }; /** @@ -148,14 +164,19 @@ class user_datasource_wrapper : public datasource { bool supports_device_read() const override { return source->supports_device_read(); } - size_t device_read(size_t offset, size_t size, uint8_t *dst) override + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size, dst); + return source->device_read(offset, size, dst, stream); } - std::unique_ptr device_read(size_t offset, size_t size) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size); + return source->device_read(offset, size, stream); } size_t size() const override { return source->size(); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp new file mode 100644 index 00000000000..22ff057cbc1 --- /dev/null +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace detail { + +file_wrapper::file_wrapper(std::string const &filepath, int flags) + : fd(open(filepath.c_str(), flags)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) + : fd(open(filepath.c_str(), flags, mode)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::~file_wrapper() { close(fd); } + +long file_wrapper::size() const +{ + if (_size < 0) { + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + _size = static_cast(st.st_size); + } + return _size; +} + +#ifdef CUFILE_FOUND + +/** + * @brief Class that manages cuFile configuration. + */ +class cufile_config { + std::string const default_policy = "OFF"; + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + + std::string const policy = default_policy; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + std::string getenv_or(std::string const &env_var_name, std::string const &default_val) + { + auto const env_val = std::getenv(env_var_name.c_str()); + return (env_val == nullptr) ? default_val : std::string(env_val); + } + + cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} + { + if (is_enabled()) { + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value + // Enable compatiblity mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } + } + } + + public: + /** + * @brief Returns true when cuFile use is enabled. + */ + bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } + + /** + * @brief Returns true when cuDF should not fall back to host IO. + */ + bool is_required() const { return policy == "ALWAYS"; } + + static cufile_config const *instance() + { + static cufile_config _instance; + return &_instance; + } +}; + +/** + * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. 
+ */ +class cufile_shim { + private: + cufile_shim(); + + void *cf_lib = nullptr; + decltype(cuFileDriverOpen) *driver_open = nullptr; + decltype(cuFileDriverClose) *driver_close = nullptr; + + std::unique_ptr init_error; + auto is_valid() const noexcept { return init_error == nullptr; } + + public: + cufile_shim(cufile_shim const &) = delete; + cufile_shim &operator=(cufile_shim const &) = delete; + + static cufile_shim const *instance(); + + ~cufile_shim() + { + driver_close(); + dlclose(cf_lib); + } + + decltype(cuFileHandleRegister) *handle_register = nullptr; + decltype(cuFileHandleDeregister) *handle_deregister = nullptr; + decltype(cuFileRead) *read = nullptr; + decltype(cuFileWrite) *write = nullptr; +}; + +cufile_shim::cufile_shim() +{ + try { + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, + "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); + + CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); + } catch (cudf::logic_error const &err) { + init_error = std::make_unique(err); + } +} + +cufile_shim const *cufile_shim::instance() +{ + static cufile_shim _instance; + // Defer throwing to avoid repeated attempts to load the library + if (!_instance.is_valid()) CUDF_FAIL("" + std::string(_instance.init_error->what())); + + return &_instance; +} + +void cufile_registered_file::register_handle() +{ + CUfileDescr_t cufile_desc{}; + cufile_desc.handle.fd = _file.desc(); + cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUDF_EXPECTS(shim->handle_register(&cf_handle, &cufile_desc).err == CU_FILE_SUCCESS, + "Cannot register file handle with cuFile"); +} + +cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } + +cufile_input_impl::cufile_input_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) +{ +} + +std::unique_ptr cufile_input_impl::read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) +{ + rmm::device_buffer out_data(size, stream); + CUDF_EXPECTS(shim->read(cf_file.handle(), out_data.data(), size, offset, 0) != -1, + "cuFile error reading from a file"); + + return datasource::buffer::create(std::move(out_data)); +} + +size_t cufile_input_impl::read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, + "cuFile error reading from a file"); + // always read the requested size for now + return size; +} + +cufile_output_impl::cufile_output_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 
0664) +{ +} + +void cufile_output_impl::write(void const *data, size_t offset, size_t size) +{ + CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1, + "cuFile error writing to a file"); +} +#endif + +std::unique_ptr make_cufile_input(std::string const &filepath) +{ +#ifdef CUFILE_FOUND + if (cufile_config::instance()->is_enabled()) { + try { + return std::make_unique(filepath); + } catch (...) { + if (cufile_config::instance()->is_required()) throw; + } + } +#endif + return nullptr; +} + +std::unique_ptr make_cufile_output(std::string const &filepath) +{ +#ifdef CUFILE_FOUND + if (cufile_config::instance()->is_enabled()) { + try { + return std::make_unique(filepath); + } catch (...) { + if (cufile_config::instance()->is_required()) throw; + } + } +#endif + return nullptr; +} + +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp new file mode 100644 index 00000000000..85399bdd44d --- /dev/null +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef CUFILE_FOUND +#include +#endif + +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace detail { + +/** + * @brief Class that provides RAII for file handling. + */ +class file_wrapper { + int const fd = -1; + long mutable _size = -1; + + public: + explicit file_wrapper(std::string const &filepath, int flags); + explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); + ~file_wrapper(); + long size() const; + auto desc() const { return fd; } +}; + +/** + * @brief Base class for cuFile input/output. + * + * Contains the common API for cuFile input and output classes. + */ +class cufile_io_base { + public: + /** + * @brief Returns an estimate of whether the cuFile operation is the optimal option. + * + * @param size Read/write operation size, in bytes. + * @return Whether a cuFile operation with the given size is expected to be faster than a host + * read + H2D copy + */ + static bool is_cufile_io_preferred(size_t size) { return size > op_size_threshold; } + + protected: + /** + * @brief The read/write size above which cuFile is faster then host read + copy + * + * This may not be the optimal threshold for all systems. Derived `is_cufile_io_preferred` + * implementations can use a different logic. + */ + static constexpr size_t op_size_threshold = 128 << 10; +}; + +/** + * @brief Interface class for cufile input. + */ +class cufile_input : public cufile_io_base { + public: + /** + * @brief Reads into a new device buffer. 
+ * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use + * + * @return The data buffer in the device memory + */ + virtual std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) = 0; + + /** + * @brief Reads into existing device memory. + * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use + * + * @return The number of bytes read + */ + virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; +}; + +/** + * @brief Interface class for cufile output. + */ +class cufile_output : public cufile_io_base { + public: + /** + * @brief Writes the data from a device buffer into a file. + * + * @throws cudf::logic_error on cuFile error + * + * @param data Pointer to the buffer to be written into the output file + * @param offset Number of bytes from the start + * @param size Number of bytes to write + */ + virtual void write(void const *data, size_t offset, size_t size) = 0; +}; + +#ifdef CUFILE_FOUND + +class cufile_shim; + +/** + * @brief Class that provides RAII for cuFile file registration. + */ +struct cufile_registered_file { + void register_handle(); + + public: + cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags) + : _file(filepath, flags), shim{shim} + { + register_handle(); + } + + cufile_registered_file(cufile_shim const *shim, + std::string const &filepath, + int flags, + mode_t mode) + : _file(filepath, flags, mode), shim{shim} + { + register_handle(); + } + + auto const &handle() const noexcept { return cf_handle; } + + ~cufile_registered_file(); + + private: + file_wrapper const _file; + CUfileHandle_t cf_handle = nullptr; + cufile_shim const *shim = nullptr; +}; + +/** + * @brief Adapter for the `cuFileRead` API. + * + * Exposes APIs to read directly from a file into device memory. + */ +class cufile_input_impl final : public cufile_input { + public: + cufile_input_impl(std::string const &filepath); + + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override; + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; + +/** + * @brief Adapter for the `cuFileWrite` API. + * + * Exposes an API to write directly into a file from device memory. 
+ */ +class cufile_output_impl final : public cufile_output { + public: + cufile_output_impl(std::string const &filepath); + + void write(void const *data, size_t offset, size_t size) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; +#else + +class cufile_input_impl final : public cufile_input { + public: + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; + +class cufile_output_impl final : public cufile_output { + public: + void write(void const *data, size_t offset, size_t size) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; +#endif + +/** + * @brief Creates a `cufile_input_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_input_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_input(std::string const &filepath); + +/** + * @brief Creates a `cufile_output_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_output_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_output(std::string const &filepath); + +} // namespace detail +} // namespace io +} // namespace cudf From 9c6e1baf023012aacc7bcada921658db0a9993eb Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 17 Mar 2021 10:47:39 -0500 Subject: [PATCH 12/21] Refactor Java host-side buffer concatenation to expose separate steps (#7610) This refactors `JCudfSerialization.concatToContiguousTable` to expose the separate steps of concatenating to a single host-side buffer and constructing a device-side contiguous table from that host buffer. This allows application code to perform other operations in-between those two steps. Authors: - Jason Lowe (@jlowe) Approvers: - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7610 --- .../ai/rapids/cudf/JCudfSerialization.java | 94 ++++++++++++++----- 1 file changed, 71 insertions(+), 23 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index bf49fb59d52..6c52b8fe798 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
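For illustration, a minimal usage sketch of the two-step flow this patch exposes, assuming the serialized `headers` and host `buffers` were produced by an earlier read/deserialization step that is not shown; the wrapper class and method names here are hypothetical, while `concatToHostBuffer`, `HostConcatResult`, and `toContiguousTable` are the APIs added in the hunks below:

```java
import java.io.IOException;

import ai.rapids.cudf.ContiguousTable;
import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.JCudfSerialization;

public class ConcatInTwoSteps {
  // headers and buffers are assumed to come from an earlier deserialization step;
  // producing them is outside the scope of this sketch.
  static ContiguousTable concat(JCudfSerialization.SerializedTableHeader[] headers,
                                HostMemoryBuffer[] buffers) throws IOException {
    // Step 1: concatenate on the host only; no device memory is allocated yet.
    try (JCudfSerialization.HostConcatResult hostResult =
             JCudfSerialization.concatToHostBuffer(headers, buffers)) {
      // ... other host-side work can happen here before touching the GPU ...

      // Step 2: copy the combined host buffer to the device and slice it into a table.
      return hostResult.toContiguousTable();
    }
  }
}
```

Splitting the host-side concatenation from the device copy is what lets application code schedule other work between the two steps, which is the motivation stated in the description above.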
@@ -353,6 +353,50 @@ static SerializedColumnHeader readFrom(DataInputStream din, long rowCount) throw } } + /** Class to hold the header and buffer pair result from host-side concatenation */ + public static final class HostConcatResult implements AutoCloseable { + private final SerializedTableHeader tableHeader; + private final HostMemoryBuffer hostBuffer; + + public HostConcatResult(SerializedTableHeader tableHeader, HostMemoryBuffer tableBuffer) { + this.tableHeader = tableHeader; + this.hostBuffer = tableBuffer; + } + + public SerializedTableHeader getTableHeader() { + return tableHeader; + } + + public HostMemoryBuffer getHostBuffer() { + return hostBuffer; + } + + /** Build a contiguous table in device memory from this host-concatenated result */ + public ContiguousTable toContiguousTable() { + DeviceMemoryBuffer devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); + try { + if (hostBuffer.length > 0) { + devBuffer.copyFromHostBuffer(hostBuffer); + } + Table table = sliceUpColumnVectors(tableHeader, devBuffer, hostBuffer); + try { + return new ContiguousTable(table, devBuffer); + } catch (Exception e) { + table.close(); + throw e; + } + } catch (Exception e) { + devBuffer.close(); + throw e; + } + } + + @Override + public void close() { + hostBuffer.close(); + } + } + /** * Visible for testing */ @@ -1681,15 +1725,32 @@ public static Table readAndConcat(SerializedTableHeader[] headers, return ct.getTable(); } + /** + * Concatenate multiple tables in host memory into a contiguous table in device memory. + * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return contiguous table in device memory + */ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] headers, HostMemoryBuffer[] dataBuffers) throws IOException { + try (HostConcatResult concatResult = concatToHostBuffer(headers, dataBuffers)) { + return concatResult.toContiguousTable(); + } + } + + /** + * Concatenate multiple tables in host memory into a single host table buffer. 
+ * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return host table header and buffer + */ + public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] headers, + HostMemoryBuffer[] dataBuffers) throws IOException { ColumnBufferProvider[][] providersPerColumn = providersFrom(headers, dataBuffers); - DeviceMemoryBuffer devBuffer = null; - Table table = null; try { SerializedTableHeader combined = calcConcatHeader(providersPerColumn); - - try (HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen)) { + HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen); + try { try (NvtxRange range = new NvtxRange("Concat Host Side", NvtxColor.GREEN)) { DataWriter writer = writerFrom(hostBuffer); int numColumns = combined.getNumColumns(); @@ -1697,27 +1758,14 @@ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] he writeConcat(writer, combined.getColumnHeader(columnIdx), providersPerColumn[columnIdx]); } } - - devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); - if (hostBuffer.length > 0) { - try (NvtxRange range = new NvtxRange("Copy Data To Device", NvtxColor.WHITE)) { - devBuffer.copyFromHostBuffer(hostBuffer); - } - } - table = sliceUpColumnVectors(combined, devBuffer, hostBuffer); - ContiguousTable result = new ContiguousTable(table, devBuffer); - table = null; - devBuffer = null; - return result; + } catch (Exception e) { + hostBuffer.close(); + throw e; } + + return new HostConcatResult(combined, hostBuffer); } finally { closeAll(providersPerColumn); - if (table != null) { - table.close(); - } - if (devBuffer != null) { - devBuffer.close(); - } } } From 0b766c5a1de627ad70a6d83e167104a52db1565b Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 17 Mar 2021 10:47:56 -0500 Subject: [PATCH 13/21] Add JNI support for IDENTITY hash partitioning (#7626) This adds in support for identity hash partitioning in JNI. Authors: - Robert (Bobby) Evans (@revans2) Approvers: - Jason Lowe (@jlowe) URL: https://github.com/rapidsai/cudf/pull/7626 --- .../main/java/ai/rapids/cudf/HashType.java | 6 +- java/src/main/java/ai/rapids/cudf/Table.java | 21 ++++++- java/src/main/native/src/TableJni.cpp | 8 ++- .../test/java/ai/rapids/cudf/TableTest.java | 58 +++++++++++++++++-- 4 files changed, 83 insertions(+), 10 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index b521bc5c42c..eb31edd8222 100644 --- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
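For illustration, a minimal sketch of calling the new overload from Java, assuming a small single-column table; the class name and sample data are hypothetical, while `HashType.IDENTITY` and `hashPartition(HashType, int)` are the additions shown in the hunks below:

```java
import java.util.Arrays;

import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.HashType;
import ai.rapids.cudf.PartitionedTable;
import ai.rapids.cudf.Table;

public class IdentityPartitionExample {
  public static void main(String[] args) {
    try (ColumnVector keys = ColumnVector.fromLongs(0, 1, 2, 3, 4, 5);
         Table input = new Table(keys);
         // Partition on column 0 into 3 partitions using the IDENTITY hash;
         // plain hashPartition(3) keeps the MURMUR3 default.
         PartitionedTable parts = input.onColumns(0).hashPartition(HashType.IDENTITY, 3)) {
      // getPartitions() holds the row offset at which each partition starts.
      System.out.println(Arrays.toString(parts.getPartitions()));
    }
  }
}
```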
@@ -22,8 +22,8 @@ * Hash algorithm identifiers, mirroring native enum cudf::hash_id */ public enum HashType { - // TODO IDENTITY(0), - // TODO MURMUR3(1), + IDENTITY(0), + MURMUR3(1), HASH_MD5(2), HASH_SERIAL_MURMUR3(3), HASH_SPARK_MURMUR3(4); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index fcc23777d69..7385b55d0df 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -185,6 +185,7 @@ public long getDeviceMemorySize() { private static native long[] hashPartition(long inputTable, int[] columnsToHash, + int hashTypeId, int numberOfPartitions, int[] outputOffsets) throws CudfException; @@ -2587,15 +2588,31 @@ public Table leftAntiJoin(TableOperation rightJoinIndices) { } /** - * Hash partition a table into the specified number of partitions. + * Hash partition a table into the specified number of partitions. Uses the default MURMUR3 + * hashing. * @param numberOfPartitions - number of partitions to use * @return - {@link PartitionedTable} - Table that exposes a limited functionality of the * {@link Table} class */ public PartitionedTable hashPartition(int numberOfPartitions) { + return hashPartition(HashType.MURMUR3, numberOfPartitions); + } + + /** + * Hash partition a table into the specified number of partitions. + * @param type the type of hash to use. Depending on the type of hash different restrictions + * on the hash column(s) may exist. Not all hash functions are guaranteed to work + * besides IDENTITY and MURMUR3. + * @param numberOfPartitions - number of partitions to use + * @return {@link PartitionedTable} - Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable hashPartition(HashType type, int numberOfPartitions) { int[] partitionOffsets = new int[numberOfPartitions]; - return new PartitionedTable(new Table(Table.hashPartition(operation.table.nativeHandle, + return new PartitionedTable(new Table(Table.hashPartition( + operation.table.nativeHandle, operation.indices, + type.nativeId, partitionOffsets.length, partitionOffsets)), partitionOffsets); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index e051f68be4e..4548156055a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1616,6 +1617,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass, jlong input_table, jintArray columns_to_hash, + jint hash_function, jint number_of_partitions, jintArray output_offsets) { @@ -1626,6 +1628,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env try { cudf::jni::auto_set_device(env); + cudf::hash_id hash_func = static_cast(hash_function); cudf::table_view *n_input_table = reinterpret_cast(input_table); cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash); cudf::jni::native_jintArray n_output_offsets(env, output_offsets); @@ -1638,7 +1641,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env } std::pair, std::vector> result = - cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions); + cudf::hash_partition(*n_input_table, + columns_to_hash_vec, + number_of_partitions, + hash_func); for (size_t i = 0; i < result.second.size(); i++) { 
n_output_offsets[i] = result.second[i]; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 88196a4112a..626f7828012 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1742,7 +1742,7 @@ void testPartStability() { final int PARTS = 5; int expectedPart = -1; try (Table start = new Table.TestBuilder().column(0).build(); - PartitionedTable out = start.onColumns(0).partition(PARTS)) { + PartitionedTable out = start.onColumns(0).hashPartition(PARTS)) { // Lets figure out what partitions this is a part of. int[] parts = out.getPartitions(); for (int i = 0; i < parts.length; i++) { @@ -1755,7 +1755,7 @@ void testPartStability() { for (int numEntries = 1; numEntries < COUNT; numEntries++) { try (ColumnVector data = ColumnVector.build(DType.INT32, numEntries, Range.appendInts(0, numEntries)); Table t = new Table(data); - PartitionedTable out = t.onColumns(0).partition(PARTS); + PartitionedTable out = t.onColumns(0).hashPartition(PARTS); HostColumnVector tmp = out.getColumn(0).copyToHost()) { // Now we need to get the range out for the partition we expect int[] parts = out.getPartitions(); @@ -1774,7 +1774,7 @@ void testPartStability() { } @Test - void testPartition() { + void testIdentityHashPartition() { final int count = 1024 * 1024; try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count)); ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> { @@ -1793,7 +1793,57 @@ void testPartition() { expected.add(i); } try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn}); - PartitionedTable output = input.onColumns(0).partition(5)) { + PartitionedTable output = input.onColumns(0).hashPartition(HashType.IDENTITY, 5)) { + int[] parts = output.getPartitions(); + assertEquals(5, parts.length); + assertEquals(0, parts[0]); + int previous = 0; + long rows = 0; + for (int i = 1; i < parts.length; i++) { + assertTrue(parts[i] >= previous); + rows += parts[i] - previous; + previous = parts[i]; + } + assertTrue(rows <= count); + try (HostColumnVector aOut = output.getColumn(0).copyToHost(); + HostColumnVector bOut = output.getColumn(1).copyToHost(); + HostColumnVector cOut = output.getColumn(2).copyToHost()) { + + for (int i = 0; i < count; i++) { + long fromA = aOut.getLong(i); + long fromB = bOut.getInt(i); + String fromC = cOut.getJavaString(i); + assertTrue(expected.remove(fromA)); + assertEquals(fromA / 2, fromB); + assertEquals(String.valueOf(fromA), fromC, "At Index " + i); + } + assertTrue(expected.isEmpty()); + } + } + } + } + + @Test + void testHashPartition() { + final int count = 1024 * 1024; + try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count)); + ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> { + for (int i = 0; i < count; i++) { + b.append(i / 2); + } + }); + ColumnVector cIn = ColumnVector.build(DType.STRING, count, (b) -> { + for (int i = 0; i < count; i++) { + b.appendUTF8String(String.valueOf(i).getBytes()); + } + })) { + + HashSet expected = new HashSet<>(); + for (long i = 0; i < count; i++) { + expected.add(i); + } + try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn}); + PartitionedTable output = input.onColumns(0).hashPartition(5)) { int[] parts = output.getPartitions(); assertEquals(5, parts.length); assertEquals(0, parts[0]); From 3349764b9a0699bf96f7403895df88f7308647fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 
12:48:09 -0600 Subject: [PATCH 14/21] Refactor string conversion check (#7599) This addresses #7557. In summary: * Move `cudf::strings::is_integer()` code from `strings/chars_types.*` to `strings/convert/convert_integers.hpp/cu` * Move `cudf::strings::is_float()` code from `strings/chars_types.*` to `strings/convert/convert_floats.hpp/cu` * Remove `cudf::strings::all_integer()` and `cudf::strings::all_float()` Authors: - Nghia Truong (@ttnghia) Approvers: - GALI PREM SAGAR (@galipremsagar) - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7599 --- .../cudf/strings/char_types/char_types.hpp | 78 +----------- .../cudf/strings/convert/convert_floats.hpp | 26 +++- .../cudf/strings/convert/convert_integers.hpp | 26 +++- cpp/src/strings/char_types/char_types.cu | 113 +----------------- cpp/src/strings/convert/convert_floats.cu | 41 ++++++- cpp/src/strings/convert/convert_integers.cu | 41 ++++++- cpp/tests/strings/chars_types_tests.cpp | 63 ---------- cpp/tests/strings/floats_tests.cpp | 35 ++++++ cpp/tests/strings/integers_tests.cu | 23 +++- java/src/main/native/src/ColumnViewJni.cpp | 1 - .../cudf/cudf/_lib/cpp/strings/char_types.pxd | 10 +- .../cpp/strings/convert/convert_floats.pxd | 6 +- .../cpp/strings/convert/convert_integers.pxd | 6 +- python/cudf/cudf/_lib/strings/char_types.pyx | 36 +----- .../_lib/strings/convert/convert_floats.pyx | 29 +++++ .../_lib/strings/convert/convert_integers.pyx | 29 +++++ python/cudf/cudf/core/column/string.py | 6 +- python/cudf/cudf/core/tools/datetimes.py | 4 +- 18 files changed, 265 insertions(+), 308 deletions(-) create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 300722920f4..1f5b6241850 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,82 +146,6 @@ std::unique_ptr filter_characters_of_type( string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to integers. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7'] - * b = s.is_integer(s) - * b is [true, true, false, false, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to integers. 
- * - * This function will return `true` if all string elements - * has at least one character in [-+0-9]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_integer(strings_column_view const& strings); - -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to floats. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9eE.]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] - * b = s.is_float(s) - * b is [true, true, false, false, true, true, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to floats. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9eE.]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_float(strings_column_view const& strings); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index cb4746dbf40..d1e00b36f6f 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,30 @@ std::unique_ptr from_floats( column_view const& floats, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to floats. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9eE.]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] + * b = s.is_float(s) + * b is [true, true, false, false, true, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. 
+ */ +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 8f42deb380d..1e2fa80b129 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,6 +73,30 @@ std::unique_ptr from_integers( column_view const& integers, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7'] + * b = s.is_integer(s) + * b is [true, true, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a new integer numeric column parsing hexadecimal values from the * provided strings column. diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 10496b89328..0b384ad0631 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -186,91 +186,6 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str mr); } -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - } // namespace detail // external API @@ -295,31 +210,5 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - -std::unique_ptr 
is_float(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_float(strings, rmm::cuda_stream_default, mr); -} - -bool all_integer(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_integer(strings, rmm::cuda_stream_default); -} - -bool all_float(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_float(strings, rmm::cuda_stream_default); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2bf65976986..b6d99efd51f 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -536,12 +537,50 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API - std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::from_floats(floats, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + // check strings for valid float chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_float(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace detail + +// external API +std::unique_ptr is_float(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_float(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 112550fc25b..5c5032b5c87 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,6 @@ std::unique_ptr from_integers(column_view const& integers, } // namespace detail // external API - std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { @@ -253,5 +253,42 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_integer(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} +} // namespace detail + +// external API +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 803a9b01b07..702329edaba 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -228,54 +227,6 @@ TEST_F(StringsCharsTest, Numerics) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsCharsTest, Integers) -{ - cudf::test::strings_column_wrapper strings1( - {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_integer(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_integer(cudf::strings_column_view(strings2))); -} - -TEST_F(StringsCharsTest, Floats) -{ - cudf::test::strings_column_wrapper strings1({"+175", - "-9.8", - "7+2", - "+-4", - "6.7e17", - "-1.2e-5", - "e", - ".e", - "1.e+-2", - "00.00", - "1.0e+1.0", - "1.2.3", - "+", - "--", - ""}); - auto results = cudf::strings::is_float(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1( - {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_float(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); - results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_float(cudf::strings_column_view(strings2))); -} - TEST_F(StringsCharsTest, EmptyStrings) { cudf::test::strings_column_wrapper strings({"", "", ""}); @@ -284,12 +235,6 @@ TEST_F(StringsCharsTest, EmptyStrings) auto results = cudf::strings::all_characters_of_type( strings_view, cudf::strings::string_character_types::ALPHANUM); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::is_integer(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_integer(strings_view)); - results = cudf::strings::is_float(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_float(strings_view)); } TEST_F(StringsCharsTest, FilterCharTypes) @@ -379,14 +324,6 @@ TEST_F(StringsCharsTest, EmptyStringsColumn) EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); EXPECT_EQ(0, results->view().size()); - results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - - results = cudf::strings::is_float(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - results = cudf::strings::filter_characters_of_type( strings_view, cudf::strings::string_character_types::NUMERIC); EXPECT_EQ(cudf::type_id::STRING, results->view().type().id()); diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index b98416d9edd..f7151363d83 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ 
b/cpp/tests/strings/floats_tests.cpp @@ -27,6 +27,41 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsFloat) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_float(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1({"+175", + "-9.8", + "7+2", + "+-4", + "6.7e17", + "-1.2e-5", + "e", + ".e", + "1.e+-2", + "00.00", + "1.0e+1.0", + "1.2.3", + "+", + "--", + ""}); + results = cudf::strings::is_float(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1( + {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + results = cudf::strings::is_float(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToFloats32) { std::vector h_strings{"1234", diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 9e2b9809b26..d6bf03b3f76 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,27 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsInteger) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToInteger) { std::vector h_strings{ diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0ce9d6303e4..ac14e1605d7 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd index ad675027c10..934269c6f25 100644 --- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column_view cimport column_view @@ -33,11 +33,3 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ string_character_types types_to_remove, string_scalar replacement, string_character_types types_to_keep) except + - - cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + - - cdef unique_ptr[column] is_float( - column_view source_strings - ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd index baee01b8f99..55a84b60efd 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -14,3 +14,7 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ cdef unique_ptr[column] from_floats( column_view input_col) except + + + cdef unique_ptr[column] is_float( + column_view source_strings + ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd index 92f99a2f5cb..6e45d4ba869 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -15,6 +15,10 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ cdef unique_ptr[column] from_integers( column_view input_col) except + + cdef unique_ptr[column] is_integer( + column_view source_strings + ) except + + cdef unique_ptr[column] hex_to_integers( column_view input_col, data_type output_type) except + diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 5d8d1522418..1890e98f956 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -14,8 +14,6 @@ from cudf._lib.cpp.strings.char_types cimport ( all_characters_of_type as cpp_all_characters_of_type, filter_characters_of_type as cpp_filter_characters_of_type, string_character_types as string_character_types, - is_integer as cpp_is_integer, - is_float as cpp_is_float, ) @@ -191,35 +189,3 @@ def is_space(Column source_strings): )) return Column.from_unique_ptr(move(c_result)) - - -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have intergers. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) - - -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. 
- """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..195d9b71f6e --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_floats cimport ( + is_float as cpp_is_float, +) + + +def is_float(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have floats. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_float( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..d1bae1edd37 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_integers cimport ( + is_integer as cpp_is_integer, +) + + +def is_integer(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have intergers. 
+ """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_integer( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ea01aa07b91..11dd7556812 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -70,13 +70,15 @@ is_alpha as cpp_is_alpha, is_decimal as cpp_is_decimal, is_digit as cpp_is_digit, - is_float as cpp_is_float, - is_integer as cpp_is_integer, is_lower as cpp_is_lower, is_numeric as cpp_is_numeric, is_space as cpp_isspace, is_upper as cpp_is_upper, ) +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) +from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float from cudf._lib.strings.combine import ( concatenate as cpp_concatenate, join as cpp_join, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4e5e4ce1987..535e497e8dc 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -8,7 +8,9 @@ from pandas.core.tools.datetimes import _unit_map import cudf -from cudf._lib.strings.char_types import is_integer as cpp_is_integer +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) from cudf.core import column from cudf.core.index import as_index from cudf.utils.dtypes import is_scalar From 168c489a9415ae7bbbec5ef600b0d3dcde44b583 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 17 Mar 2021 20:37:11 -0500 Subject: [PATCH 15/21] Fix Series/Dataframe Mixed Arithmetic (#7491) Fixes https://github.com/rapidsai/cudf/issues/7385 Authors: - @brandon-b-miller Approvers: - GALI PREM SAGAR (@galipremsagar) - Michael Wang (@isVoid) URL: https://github.com/rapidsai/cudf/pull/7491 --- python/cudf/cudf/core/dataframe.py | 8 ++--- python/cudf/cudf/core/series.py | 4 +-- python/cudf/cudf/tests/test_dataframe.py | 42 ++++++++++++------------ 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 25f57748765..9672ab3002f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1518,11 +1518,7 @@ def fallback(col, fn): else: if col not in df_cols: r_opr = other_cols[col] - l_opr = Series( - column_empty( - len(self), masked=True, dtype=other.dtype - ) - ) + l_opr = Series(as_column(np.nan, length=len(self))) if col not in other_cols_keys: r_opr = None l_opr = self[col] @@ -2198,7 +2194,7 @@ def rpow(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("rpow", other, fill_value) def __rpow__(self, other): - return self._apply_op("__pow__", other) + return self._apply_op("__rpow__", other) def floordiv(self, other, axis="columns", level=None, fill_value=None): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5e7121c0488..b06fef178f6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1501,9 +1501,7 @@ def _binaryop( If ``reflect`` is ``True``, swap the order of the operands. """ if isinstance(other, cudf.DataFrame): - # TODO: fn is not the same as arg expected by _apply_op - # e.g. 
for fn = 'and', _apply_op equivalent is '__and__' - return other._apply_op(self, fn) + return NotImplemented result_name = utils.get_result_name(self, other) if isinstance(other, Series): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 77548b95277..5f4d571e8c5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4996,13 +4996,13 @@ def test_cov_nans(): @pytest.mark.parametrize( "gsr", [ - cudf.Series([1, 2, 3]), - cudf.Series([1, 2, 3], index=["a", "b", "c"]), - cudf.Series([1, 2, 3], index=["a", "b", "d"]), - cudf.Series([1, 2], index=["a", "b"]), - cudf.Series([1, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), + cudf.Series([4, 2, 3]), + cudf.Series([4, 2, 3], index=["a", "b", "c"]), + cudf.Series([4, 2, 3], index=["a", "b", "d"]), + cudf.Series([4, 2], index=["a", "b"]), + cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), pytest.param( - cudf.Series([1, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), + cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), marks=pytest.mark.xfail, ), ], @@ -5017,32 +5017,32 @@ def test_cov_nans(): operator.truediv, operator.mod, operator.pow, - # comparison ops will temporarily XFAIL - # see PR https://github.com/rapidsai/cudf/pull/7491 - pytest.param(operator.eq, marks=pytest.mark.xfail()), - pytest.param(operator.lt, marks=pytest.mark.xfail()), - pytest.param(operator.le, marks=pytest.mark.xfail()), - pytest.param(operator.gt, marks=pytest.mark.xfail()), - pytest.param(operator.ge, marks=pytest.mark.xfail()), - pytest.param(operator.ne, marks=pytest.mark.xfail()), + operator.eq, + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.ne, ], ) def test_df_sr_binop(gsr, colnames, op): - data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]] + data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] data = dict(zip(colnames, data)) + gsr = gsr.astype("float64") + gdf = cudf.DataFrame(data) - pdf = pd.DataFrame.from_dict(data) + pdf = gdf.to_pandas(nullable=True) - psr = gsr.to_pandas() + psr = gsr.to_pandas(nullable=True) expect = op(pdf, psr) - got = op(gdf, gsr) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gdf, gsr).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) expect = op(psr, pdf) - got = op(psr, pdf) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gsr, gdf).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) @pytest.mark.parametrize( From 99001d2c8d9b3898e58c74d7979ab6204c5e5bee Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 18 Mar 2021 09:57:30 +0800 Subject: [PATCH 16/21] Java support on explode_outer (#7625) This pull request aims to enable `cudf::explode_outer` and `cudf::explode_outer_position` in Java package. 
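For illustration, a minimal sketch of the two new `Table` methods, assuming an `input` table whose first column is a LIST column; the class and method names here are hypothetical, and the expected rows follow the Javadoc examples in the diff below:

```java
import ai.rapids.cudf.Table;

public class ExplodeOuterExample {
  // 'input' is assumed to already hold a LIST column at index 0, e.g. rows
  //   [[5, null, 15], 100]
  //   [null,          200]
  // (how the table is built, e.g. via ColumnVector.fromLists, is not shown here).
  static void explodeBoth(Table input) {
    // Unlike explode(), null and empty lists are preserved as null rows in the output.
    try (Table exploded = input.explodeOuter(0)) {
      // rows: [5, 100], [null, 100], [15, 100], [null, 200]
    }
    // Same expansion, plus a position column placed before the exploded values.
    try (Table withPositions = input.explodeOuterPosition(0)) {
      // columns: explode_position, explode_value, then the remaining input columns
    }
  }
}
```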
Authors: - Alfred Xu (@sperlingxx) Approvers: - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7625 --- java/src/main/java/ai/rapids/cudf/Table.java | 141 +++++++++++++++--- java/src/main/native/src/TableJni.cpp | 28 ++++ .../test/java/ai/rapids/cudf/TableTest.java | 86 +++++++++-- 3 files changed, 218 insertions(+), 37 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 7385b55d0df..d0e59fdc105 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -516,6 +516,10 @@ private static native long[] repeatColumnCount(long tableHandle, private static native long[] explodePosition(long tableHandle, int index); + private static native long[] explodeOuter(long tableHandle, int index); + + private static native long[] explodeOuterPosition(long tableHandle, int index); + private static native long createCudfTableView(long[] nativeColumnViewHandles); private static native long[] columnViewsFromPacked(ByteBuffer metadata, long dataAddress); @@ -1725,7 +1729,7 @@ public ContiguousTable[] contiguousSplit(int... indices) { * Example: * input: [[5,10,15], 100], * [[20,25], 200], - * [[30], 300], + * [[30], 300] * index: 0 * output: [5, 100], * [10, 100], @@ -1737,12 +1741,12 @@ public ContiguousTable[] contiguousSplit(int... indices) { * * Nulls propagate in different ways depending on what is null. * - * [[5,null,15], 100], - * [null, 200] - * returns: - * [5, 100], - * [null, 100], - * [15, 100] + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100] * * Note that null lists are completely removed from the output * and nulls inside lists are pulled out and remain. @@ -1763,27 +1767,26 @@ public Table explode(int index) { * in the output. The corresponding rows for other columns in the input are duplicated. A position * column is added that has the index inside the original list for each row. Example: * - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [0, 5, 100], - * [1, 10, 100], - * [2, 15, 100], - * [0, 20, 200], - * [1, 25, 200], - * [0, 30, 300], + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300] + * index: 0 + * output: [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300] * * * Nulls and empty lists propagate in different ways depending on what is null or empty. * - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [0, 5, 100], - * [1, null, 100], - * [2, 15, 100], + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100] * * * Note that null lists are not included in the resulting table, but nulls inside @@ -1799,6 +1802,96 @@ public Table explodePosition(int index) { return new Table(explodePosition(nativeHandle, index)); } + /** + * Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded + * into new rows in the output. The corresponding rows for other columns in the input + * are duplicated. + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300] + * + * + * Nulls propagate in different ways depending on what is null. 
+ * + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100], + * [null, 200] + * + * Note that null lists are completely removed from the output + * and nulls inside lists are pulled out and remain. + * + * @param index Column index to explode inside the table. + * @return A new table with explode_col exploded. + */ + public Table explodeOuter(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explodeOuter(nativeHandle, index)); + } + + /** + * Explodes a list column's elements retaining any null entries or empty lists and includes a + * position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. Example: + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300] + * + * + * Nulls and empty lists propagate as null entries in the result. + * + * input: [[5,null,15], 100], + * [null, 200], + * [[], 300] + * index: 0 + * output: [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * [0, null, 200], + * [0, null, 300] + * + * + * returns + * + * @param index Column index to explode inside the table. + * @return A new table with exploded value and position. The column order of return table is + * [cols before explode_input, explode_position, explode_value, cols after explode_input]. + */ + public Table explodeOuterPosition(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explodeOuterPosition(nativeHandle, index)); + } + /** * Gathers the rows of this table according to `gatherMap` such that row "i" * in the resulting table's columns will contain row "gatherMap[i]" from this table. 
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 4548156055a..02385a453d0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2052,4 +2052,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 626f7828012..c2e28e1cad8 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4635,7 +4635,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out } } - private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { + private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) { StructType nestedType = new StructType(true, new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); try (Table input = new Table.TestBuilder() @@ -4644,23 +4644,42 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { Arrays.asList(struct(4, "k4"), struct(5, "k5")), Arrays.asList(struct(6, "k6")), Arrays.asList(new HostColumnVector.StructData((List) null)), - Arrays.asList()) + null) .column("s1", "s2", "s3", "s4", "s5") .column(1, 3, 5, 7, 9) .column(12.0, 14.0, 13.0, 11.0, 15.0) .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + if (!outer) + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + else + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0); } - try (Table expected = expectedBuilder - .column(nestedType, + List expectedData = new ArrayList(){{ + if (!outer) { + this.add(new HostColumnVector.StructData[]{ + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), + new HostColumnVector.StructData((List) null)}); + this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4"}); + this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7}); + this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0}); + } else { + this.add(new HostColumnVector.StructData[]{ struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), - new HostColumnVector.StructData((List) null)) - .column("s1", "s1", "s1", "s2", "s2", "s3", "s4") - .column(1, 1, 1, 3, 3, 5, 7) - 
.column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0) + new HostColumnVector.StructData((List) null), null}); + this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4", "s5"}); + this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7, 9}); + this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0, 15.0}); + } + }}; + try (Table expected = expectedBuilder + .column(nestedType, (HostColumnVector.StructData[]) expectedData.get(0)) + .column((String[]) expectedData.get(1)) + .column((Integer[]) expectedData.get(2)) + .column((Double[]) expectedData.get(3)) .build()) { return new Table[]{new Table(input.getColumns()), new Table(expected.getColumns())}; } @@ -4679,7 +4698,7 @@ void testExplode() { } // Child is nested type - Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false); + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, false); try (Table input = testTables2[0]; Table expected = testTables2[1]) { try (Table exploded = input.explode(0)) { @@ -4689,7 +4708,7 @@ void testExplode() { } @Test - void testPosExplode() { + void testExplodePosition() { // Child is primitive type Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, false); try (Table input = testTables[0]; @@ -4699,8 +4718,8 @@ void testPosExplode() { } } - // Child is primitive type - Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true); + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, false); try (Table input = testTables2[0]; Table expected = testTables2[1]) { try (Table exploded = input.explodePosition(0)) { @@ -4709,4 +4728,45 @@ void testPosExplode() { } } + @Test + void testExplodeOuter() { + // Child is primitive type + Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(false, true); + try (Table input = testTables[0]; + Table expected = testTables[1]) { + try (Table exploded = input.explodeOuter(0)) { + assertTablesAreEqual(expected, exploded); + } + } + + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, true); + try (Table input = testTables2[0]; + Table expected = testTables2[1]) { + try (Table exploded = input.explodeOuter(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } + + @Test + void testExplodeOuterPosition() { + // Child is primitive type + Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, true); + try (Table input = testTables[0]; + Table expected = testTables[1]) { + try (Table exploded = input.explodeOuterPosition(0)) { + assertTablesAreEqual(expected, exploded); + } + } + + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, true); + try (Table input = testTables2[0]; + Table expected = testTables2[1]) { + try (Table exploded = input.explodeOuterPosition(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } } From 9aa33efa9e931c9de78e047a20e6a7481ee13559 Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Wed, 17 Mar 2021 22:54:30 -0400 Subject: [PATCH 17/21] Optimize cudf::make_strings_column for long strings (#7576) Reference #7571 This improves the performance of the `cudf::make_strings_column` for long strings. It uses a similar approach from `cudf::strings::detail::gather` and also use thresholding as in the optimized `cudf::strings::replace`. This may not be the right solution for overall optimizing #7571 but may be helpful in other places where long strings are used for created a strings column in libcudf. 
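To make the strategy selection concrete, here is a rough sketch of the thresholding logic described above (simplified from the `strings_column_factories.cuh` change below; the free function and its parameters are illustrative):

```cpp
#include <cudf/types.hpp>

#include <algorithm>
#include <cstddef>

// Average string length (in bytes) above which the character-parallel
// gather kernel wins over the one-memcpy-per-row approach.
constexpr cudf::size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64;

bool use_character_parallel_gather(std::size_t total_bytes,
                                   cudf::size_type strings_count,
                                   cudf::size_type null_count)
{
  // Average over non-null rows only; guard against dividing by zero.
  auto const avg_bytes_per_row =
    total_bytes / std::max<cudf::size_type>(strings_count - null_count, 1);
  return avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD;
}
```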
This PR also includes a gbenchmark to help measure the performance results of this factory function. The results of the benchmark are that longer strings (~ >64 bytes on average) showed about a 10x improvement. I can post benchmark results here if needed. The character-parallel algorithm was slower for shorter strings so the existing algorithm is used based on the a threshold calculation. I also added an additional gtest with a mixture of nulls and empty strings to make sure the new algorithm handles these correctly. Authors: - David (@davidwendt) Approvers: - Jason Lowe (@jlowe) - Nghia Truong (@ttnghia) - Jake Hemstad (@jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7576 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/factory_benchmark.cu | 93 ++++++++++++++++++ cpp/benchmarks/string/string_bench_args.hpp | 2 + cpp/include/cudf/strings/detail/gather.cuh | 87 ++++++++++------ .../detail/strings_column_factories.cuh | 98 +++++++++++++++---- cpp/include/cudf/utilities/traits.hpp | 12 +++ cpp/tests/strings/factories_test.cu | 34 +++++++ 7 files changed, 280 insertions(+), 47 deletions(-) create mode 100644 cpp/benchmarks/string/factory_benchmark.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dfc340b1459..e63ea38a31b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -185,6 +185,7 @@ ConfigureBench(STRINGS_BENCH string/convert_floats_benchmark.cpp string/copy_benchmark.cpp string/extract_benchmark.cpp + string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu new file mode 100644 index 00000000000..6c5dceffaa8 --- /dev/null +++ b/cpp/benchmarks/string/factory_benchmark.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? 
string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +class StringsFactory : public cudf::benchmark { +}; + +static void BM_factory(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::make_strings_column(pairs); + } + + cudf::strings_column_view input(table->view().column(0)); + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringsFactory, name) \ + (::benchmark::State & st) { BM_factory(st); } \ + BENCHMARK_REGISTER_F(StringsFactory, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index f81f859de74..9c709b064dd 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -17,6 +17,8 @@ #include +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. * diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28da8ef4324..988fa552100 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -31,15 +31,60 @@ #include namespace cudf { +namespace strings { +namespace detail { -template -constexpr inline bool is_signed_iterator() +/** + * @brief Returns a new chars column using the specified indices to select + * strings from the input iterator. + * + * This uses a character-parallel gather CUDA kernel that performs very + * well on a strings column with long strings (e.g. average > 64 bytes). + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param map_begin Start of index iterator. + * @param map_end End of index iterator. + * @param offsets The offset values to be associated with the output chars column. + * @param chars_bytes The total number of bytes for the output chars column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return New chars column fit for a strings column. 
+ */ +template +std::unique_ptr gather_chars(StringIterator strings_begin, + MapIterator map_begin, + MapIterator map_end, + cudf::device_span const offsets, + size_type chars_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return std::is_signed::value_type>::value; -} + auto const output_count = std::distance(map_begin, map_end); + if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); -namespace strings { -namespace detail { + auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto const d_chars = chars_column->mutable_view().template data(); + + auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { + auto const out_row = + thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); + auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index + auto const d_str = strings_begin[row_idx]; // get row's string + auto const offset = out_idx - *out_row; // get string's char + return d_str.data()[offset]; + }; + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + d_chars, + gather_chars_fn); + + return chars_column; +} /** * @brief Returns a new strings column using the specified indices to select @@ -107,29 +152,15 @@ std::unique_ptr gather( rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); // build chars column - size_type const out_chars_bytes = static_cast(total_bytes); - auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr); - auto const d_out_chars = out_chars_column->mutable_view().template data(); - - // fill in chars cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_in_chars = (strings_count > 0) ? 
strings.chars().data() : nullptr; - auto gather_chars_fn = - [d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) { - // find output row index for this output char index - auto const next_row_ptr = thrust::upper_bound( - thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx); - auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1; - auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx]; - auto const in_row_idx = begin[out_row_idx]; - auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset; - return d_in_chars[in_char_offset]; - }; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(out_chars_bytes), - d_out_chars, - gather_chars_fn); + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto out_chars_column = gather_chars(d_strings->begin(), + begin, + end, + d_out_offsets_span, + static_cast(total_bytes), + stream, + mr); return make_strings_column(output_count, std::move(out_offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e843c555c5..932f7eb0926 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include @@ -34,7 +36,27 @@ namespace cudf { namespace strings { namespace detail { -// Create a strings-type column from iterators of pointer/size pairs +/** + * @brief Average string byte-length threshold for deciding character-level + * vs. row-level parallel algorithm. + * + * This value was determined by running the factory_benchmark against different + * string lengths and observing the point where the performance is faster for + * long strings. + */ +constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; + +/** + * @brief Create a strings-type column from iterators of pointer/size pairs + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param begin First string row (inclusive) + * @param end Last string row (exclusive) + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, @@ -51,7 +73,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? 
item.second : 0; }; - size_t bytes = thrust::transform_reduce( + size_t const bytes = thrust::transform_reduce( rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < static_cast(std::numeric_limits::max()), "total size of strings is too large for cudf column"); @@ -65,26 +87,49 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); // create null mask - auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; - auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); - auto null_count = new_nulls.second; + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); + auto const null_count = new_nulls.second; auto null_mask = (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair str = thrust::get<0>(item); - size_type offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_column->view().template begin())), - strings_count, - copy_chars); + std::unique_ptr chars_column = [&] { + // use a character-parallel kernel for long string lengths + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const d_offsets = + device_span{offsets_column->view().template data(), + static_cast(offsets_column->size())}; + auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) { + return string_view{ip.first, ip.second}; + }); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + static_cast(bytes), + stream, + mr); + } else { + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + size_type const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_tuple( + begin, offsets_column->view().template begin())), + strings_count, + copy_chars); + return chars_column; + } + }(); return make_strings_column(strings_count, std::move(offsets_column), @@ -95,7 +140,22 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, mr); } -// Create a strings-type column from iterators to chars, offsets, and bitmask. +/** + * @brief Create a strings-type column from iterators to chars, offsets, and bitmask. 
+ * + * @tparam CharIterator iterator over character bytes (int8) + * @tparam OffsetIterator iterator over offset values (size_type) + * + * @param chars_begin First character byte (inclusive) + * @param chars_end Last character byte (exclusive) + * @param offset_begin First offset value (inclusive) + * @param offset_end Last offset value (exclusive) + * @param null_count Number of null rows + * @param null_mask The validity bitmask in Arrow format + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(CharIterator chars_begin, CharIterator chars_end, diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e045476ea77..1e0d45d081d 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type) return cudf::type_dispatcher(type, is_unsigned_impl{}); } +/** + * @brief Indicates whether the `Iterator` value type is unsigned. + * + * @tparam Iterator The type to verify + * @return true if the iterator's value type is unsigned + */ +template +constexpr inline bool is_signed_iterator() +{ + return std::is_signed::value_type>::value; +} + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index f904c404251..bd463a7ab0d 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -19,12 +19,18 @@ #include #include #include +#include #include #include #include #include #include +#include + +#include +#include + #include #include @@ -198,3 +204,31 @@ TEST_F(StringsFactoriesTest, CreateOffsets) } } } + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) +{ + cudf::test::strings_column_wrapper data( + {"", "this", "is", "", "a", "", "column", "of", "strings", "", ""}, + {0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1}); + + auto d_column = cudf::column_device_view::create(data); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + auto result = cudf::make_strings_column(pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); +} From 951b455b14a37efcbffc38638ab0b89d787d5b59 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 20:29:36 -0700 Subject: [PATCH 18/21] Adds `list.take`, python binding for `cudf::lists::segmented_gather` (#7591) Closes #7465 Implements `ListColumn.list.take` based on `cudf::lists:segmented_gather`. Gather elements inside each list based on the provided positions. 
Example: ```python >>> s = cudf.Series([[1, 2, 3], [4, 5]]) >>> s 0 [1, 2, 3] 1 [4, 5] dtype: list >>> s.list.take([[2, 1], [1, 0]]) 0 [3, 2] 1 [5, 4] dtype: list ``` Authors: - Michael Wang (@isVoid) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7591 --- python/cudf/cudf/_lib/copying.pyx | 25 ++++++++- python/cudf/cudf/_lib/cpp/lists/gather.pxd | 13 +++++ python/cudf/cudf/core/column/lists.py | 61 +++++++++++++++++++++- python/cudf/cudf/tests/test_list.py | 47 +++++++++++++++++ 4 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/lists/gather.pxd diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ad798a73ed2..e5501428624 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -3,7 +3,7 @@ import pandas as pd from libcpp cimport bool -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared from libcpp.vector cimport vector from libcpp.utility cimport move from libc.stdint cimport int32_t, int64_t @@ -24,6 +24,10 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view +from cudf._lib.cpp.lists.gather cimport ( + segmented_gather as cpp_segmented_gather +) cimport cudf._lib.cpp.copying as cpp_copying # workaround for https://github.com/cython/cython/issues/3885 @@ -704,3 +708,22 @@ def sample(Table input, size_type n, else input._index_names ) ) + + +def segmented_gather(Column source_column, Column gather_map): + cdef shared_ptr[lists_column_view] source_LCV = ( + make_shared[lists_column_view](source_column.view()) + ) + cdef shared_ptr[lists_column_view] gather_map_LCV = ( + make_shared[lists_column_view](gather_map.view()) + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_segmented_gather( + source_LCV.get()[0], gather_map_LCV.get()[0]) + ) + + result = Column.from_unique_ptr(move(c_result)) + return result diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd new file mode 100644 index 00000000000..ea664eee82e --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + + +cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] segmented_gather( + const lists_column_view source_column, + const lists_column_view gather_map_list + ) except + diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index a60fe627acb..1d3f73822a9 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -2,14 +2,16 @@ import pickle +import numpy as np import pyarrow as pa import cudf +from cudf._lib.copying import segmented_gather from cudf._lib.lists import count_elements from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, column +from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin -from cudf.utils.dtypes import is_list_dtype +from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype class ListColumn(ColumnBase): @@ -228,3 +230,58 @@ def len(self): dtype: int32 """ return self._return_or_inplace(count_elements(self._column)) + + def take(self, lists_indices): + """ + Collect list elements based on given indices. + + Parameters + ---------- + lists_indices: List type arrays + Specifies what to collect from each row + + Returns + ------- + ListColumn + + Examples + -------- + >>> s = cudf.Series([[1, 2, 3], None, [4, 5]]) + >>> s + 0 [1, 2, 3] + 1 None + 2 [4, 5] + dtype: list + >>> s.list.take([[0, 1], [], []]) + 0 [1, 2] + 1 None + 2 [] + dtype: list + """ + + lists_indices_col = as_column(lists_indices) + if not isinstance(lists_indices_col, ListColumn): + raise ValueError("lists_indices should be list type array.") + if not lists_indices_col.size == self._column.size: + raise ValueError( + "lists_indices and list column is of different " "size." + ) + if not is_numerical_dtype( + lists_indices_col.children[1].dtype + ) or not np.issubdtype( + lists_indices_col.children[1].dtype, np.integer + ): + raise TypeError( + "lists_indices should be column of values of index types." 
+ ) + + try: + res = self._return_or_inplace( + segmented_gather(self._column, lists_indices_col) + ) + except RuntimeError as e: + if "contains nulls" in str(e): + raise ValueError("lists_indices contains null.") from e + raise + else: + return res diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 195d8749ec6..33812cfa7a7 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -112,3 +112,50 @@ def test_len(data): got = gsr.list.len() assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + ("data", "idx"), + [ + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]), + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]), + ([[1, 2, 3], []], [[0, 1], []]), + ([[1, 2, 3], [None]], [[0, 1], []]), + ([[1, None, 3], None], [[0, 1], []]), + ], +) +def test_take(data, idx): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + + expected = pd.Series(zip(ps, idx)).map( + lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None + ) + got = gs.list.take(idx) + assert_eq(expected, got) + + +@pytest.mark.parametrize( + ("invalid", "exception"), + [ + ([[0]], pytest.raises(ValueError, match="different size")), + ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), + ( + [["a", "b"], ["c"]], + pytest.raises( + TypeError, match="should be column of values of index types" + ), + ), + ( + [[[1], [0]], [[0]]], + pytest.raises( + TypeError, match="should be column of values of index types" + ), + ), + ([[0, 1], None], pytest.raises(ValueError, match="contains null")), + ], +) +def test_take_invalid(invalid, exception): + gs = cudf.Series([[0, 1], [2, 3]]) + with exception: + gs.list.take(invalid) From 873955e08ae47de118b73ba9c52dfaa3062fce81 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Thu, 18 Mar 2021 09:32:06 -0400 Subject: [PATCH 19/21] Add explicit fixed_point merge test (#7635) While confirming that libcudf does indeed support `cudf::merge` with `fixed_point` ended up writing an explicit test that there is no reason not to just add as a unit test. 
Authors: - Conor Hoekstra (@codereport) Approvers: - Karthikeyan (@karthikeyann) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7635 --- cpp/tests/merge/merge_test.cpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index fa3bde8cb52..451fa82d5a3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -729,4 +729,36 @@ TEST_F(MergeTest, KeysWithNulls) } } +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, FixedPointMerge) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + + auto const a = fp_wrapper{{4, 22, 33, 44, 55}, scale_type{-1}}; + auto const b = fp_wrapper{{5, 7, 10}, scale_type{-1}}; + auto const table_a = cudf::table_view(std::vector{a}); + auto const table_b = cudf::table_view(std::vector{b}); + auto const tables = std::vector{table_a, table_b}; + + auto const key_cols = std::vector{0}; + auto const order = std::vector{cudf::order::ASCENDING}; + + auto const exp = fp_wrapper{{4, 5, 7, 10, 22, 33, 44, 55}, scale_type{-1}}; + auto const exp_table = cudf::table_view(std::vector{exp}); + + auto const result = cudf::merge(tables, key_cols, order); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(exp_table.column(0), result->view().column(0)); +} + CUDF_TEST_PROGRAM_MAIN() From d6cc6943411c00d01368f8a1a53567997ffaff39 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Thu, 18 Mar 2021 08:55:43 -0500 Subject: [PATCH 20/21] Add in JNI support for table partition (#7637) This adds in support for partition. Which will partition a table based off of a partition map. Authors: - Robert (Bobby) Evans (@revans2) Approvers: - Jason Lowe (@jlowe) URL: https://github.com/rapidsai/cudf/pull/7637 --- java/src/main/java/ai/rapids/cudf/Table.java | 21 ++++++++++++ java/src/main/native/src/TableJni.cpp | 33 +++++++++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 17 ++++++++++ 3 files changed, 71 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index d0e59fdc105..0dc529d423f 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -183,6 +183,9 @@ public long getDeviceMemorySize() { private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); + private static native long[] partition(long inputTable, long partitionView, + int numberOfPartitions, int[] outputOffsets); + private static native long[] hashPartition(long inputTable, int[] columnsToHash, int hashTypeId, @@ -1257,6 +1260,24 @@ public Table repeat(ColumnVector counts, boolean checkCount) { return new Table(repeatColumnCount(this.nativeHandle, counts.getNativeView(), checkCount)); } + /** + * Partition this table using the mapping in partitionMap. partitionMap must be an integer + * column. The number of rows in partitionMap must be the same as this table. Each row + * in the map will indicate which partition the rows in the table belong to. + * @param partitionMap the partitions for each row. 
+ * @param numberOfPartitions number of partitions + * @return {@link PartitionedTable} Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable partition(ColumnView partitionMap, int numberOfPartitions) { + int[] partitionOffsets = new int[numberOfPartitions]; + return new PartitionedTable(new Table(partition( + getNativeView(), + partitionMap.getNativeView(), + partitionOffsets.length, + partitionOffsets)), partitionOffsets); + } + /** * Find smallest indices in a sorted table where values should be inserted to maintain order. *
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 02385a453d0..81b9882104f 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1614,6 +1614,39 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env,
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jclass,
+                                                                 jlong input_table,
+                                                                 jlong partition_column,
+                                                                 jint number_of_partitions,
+                                                                 jintArray output_offsets) {
+
+  JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
+  JNI_NULL_CHECK(env, partition_column, "partition_column is null", NULL);
+  JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
+  JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions is zero", NULL);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::column_view *n_part_column = reinterpret_cast<cudf::column_view *>(partition_column);
+    cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
+
+    auto result = cudf::partition(*n_input_table,
+                                  *n_part_column,
+                                  number_of_partitions);
+
+    for (size_t i = 0; i < result.second.size() - 1; i++) {
+      // For whatever reason, partition returns the length of the result at the
+      // end while hash partition/round robin do not, so skip the last entry for
+      // consistency.
+      n_output_offsets[i] = result.second[i];
+    }
+
+    return cudf::jni::convert_table_for_return(env, result.first);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass,
                                                                      jlong input_table,
                                                                      jintArray columns_to_hash,
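To clarify the offset handling in the native code above, a hedged sketch of the shape of `cudf::partition`'s return value (the demo function and variable names are illustrative):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/partitioning.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <vector>

std::vector<cudf::size_type> partition_offsets_demo(cudf::table_view const& input,
                                                    cudf::column_view const& partition_map,
                                                    cudf::size_type num_partitions)
{
  // result.first  : the rows of `input` reordered by partition
  // result.second : start offset of each partition; cudf::partition also appends
  //                 the total row count as a final entry, which the JNI code above
  //                 skips so the result matches hashPartition/roundRobinPartition.
  auto result = cudf::partition(input, partition_map, num_partitions);

  // The reordered table in result.first is ignored in this offsets-only demo.
  std::vector<cudf::size_type> offsets(result.second.begin(), result.second.end() - 1);
  return offsets;
}
```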
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c2e28e1cad8..c075f074068 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1773,6 +1773,23 @@ void testPartStability() {
     }
   }
 
+  @Test
+  void testPartition() {
+    try (Table t = new Table.TestBuilder()
+        .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+        .build();
+         ColumnVector parts = ColumnVector
+             .fromInts(1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+         PartitionedTable pt = t.partition(parts, 3);
+         Table expected = new Table.TestBuilder()
+             .column(1, 3, 5, 7, 9, 2, 4, 6, 8, 10)
+             .build()) {
+      int[] partCutoffs = pt.getPartitions();
+      assertArrayEquals(new int[]{0, 0, 5}, partCutoffs);
+      assertTablesAreEqual(expected, pt.getTable());
+    }
+  }
+
   @Test
   void testIdentityHashPartition() {
     final int count = 1024 * 1024;

From 472305172bd373823db0976e1b0f53b3e7cc11f1 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Mar 2021 10:33:55 -0400
Subject: [PATCH 21/21] Add gbenchmarks for string substrings functions (#7603)

Reference #5698
This creates a gbenchmark for the 4 variations of the `cudf::strings::slice_strings()` API. The benchmark measures various row counts as well as string lengths.
This PR also includes changes to the `substring.cu` implementation, cleaning up the code and using the more efficient `make_strings_children`. This change improved performance for all 4 functions by 2-3x on average.
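The `substring.cu` refactor relies on the two-pass functor contract of `cudf::strings::detail::make_strings_children`: the functor is first invoked with `d_chars == nullptr` to record each row's output size in `d_offsets`, then again with `d_chars` set to write the bytes. A minimal sketch of such a functor (an illustrative identity copy, not the real `substring_fn`):

```cpp
#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/types.hpp>

// Illustrative functor: copies every input string unchanged.
struct copy_string_fn {
  cudf::column_device_view const d_column;  // input strings column
  int32_t* d_offsets{};                     // filled during the sizing pass
  char* d_chars{};                          // null on the sizing pass, set on the write pass

  __device__ void operator()(cudf::size_type idx)
  {
    if (d_column.is_null(idx)) {
      if (!d_chars) d_offsets[idx] = 0;
      return;
    }
    auto const d_str = d_column.element<cudf::string_view>(idx);
    if (d_chars) {
      memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
    } else {
      d_offsets[idx] = d_str.size_bytes();
    }
  }
};
```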

Authors:
  - David (@davidwendt)

Approvers:
  - Nghia Truong (@ttnghia)
  - @nvdbaranec
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7603
---
 cpp/benchmarks/CMakeLists.txt                 |   1 +
 cpp/benchmarks/string/substring_benchmark.cpp |  93 ++++++++++++
 cpp/src/strings/substring.cu                  | 136 ++++++++----------
 3 files changed, 155 insertions(+), 75 deletions(-)
 create mode 100644 cpp/benchmarks/string/substring_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index e63ea38a31b..682f1ac5fca 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -190,4 +190,5 @@ ConfigureBench(STRINGS_BENCH
   string/find_benchmark.cpp
   string/replace_benchmark.cpp
   string/split_benchmark.cpp
+  string/substring_benchmark.cpp
   string/url_decode_benchmark.cpp)
diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring_benchmark.cpp
new file mode 100644
index 00000000000..d47c42e45be
--- /dev/null
+++ b/cpp/benchmarks/string/substring_benchmark.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+class StringSubstring : public cudf::benchmark {
+};
+
+enum substring_type { position, multi_position, delimiter, multi_delimiter };
+
+static void BM_substring(benchmark::State& state, substring_type rt)
+{
+  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
+  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  auto starts_itr = thrust::constant_iterator<cudf::size_type>(1);
+  auto stops_itr  = thrust::constant_iterator<cudf::size_type>(max_str_length / 2);
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> starts(starts_itr, starts_itr + n_rows);
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> stops(stops_itr, stops_itr + n_rows);
+  auto delim_itr = thrust::constant_iterator(" ");
+  cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows);
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    switch (rt) {
+      case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break;
+      case multi_position: cudf::strings::slice_strings(input, starts, stops); break;
+      case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break;
+      case multi_delimiter:
+        cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1);
+        break;
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)                                  \
+  BENCHMARK_DEFINE_F(StringSubstring, name)                             \
+  (::benchmark::State & st) { BM_substring(st, substring_type::name); } \
+  BENCHMARK_REGISTER_F(StringSubstring, name)                           \
+    ->Apply(generate_bench_args)                                        \
+    ->UseManualTime()                                                   \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(position)
+STRINGS_BENCHMARK_DEFINE(multi_position)
+STRINGS_BENCHMARK_DEFINE(delimiter)
+STRINGS_BENCHMARK_DEFINE(multi_delimiter)
diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu
index 68080c0eb89..f712b0cb6aa 100644
--- a/cpp/src/strings/substring.cu
+++ b/cpp/src/strings/substring.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,17 +43,25 @@ namespace {
  * using the provided start, stop, and step parameters.
  */
 struct substring_fn {
-  const column_device_view d_column;
-  numeric_scalar_device_view d_start, d_stop, d_step;
-  const int32_t* d_offsets{};
+  column_device_view const d_column;
+  numeric_scalar_device_view const d_start;
+  numeric_scalar_device_view const d_stop;
+  numeric_scalar_device_view const d_step;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ cudf::size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_column.is_null(idx)) return 0;  // null string
-    string_view d_str = d_column.template element<string_view>(idx);
+    if (d_column.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const d_str  = d_column.template element<string_view>(idx);
     auto const length = d_str.length();
-    if (length == 0) return 0;  // empty string
+    if (length == 0) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     size_type const step = d_step.is_valid() ? d_step.value() : 1;
     auto const begin     = [&] {  // always inclusive
       // when invalid, default depends on step
@@ -88,7 +96,7 @@ struct substring_fn {
       if (d_buffer) d_buffer += from_char_utf8(*itr, d_buffer);
       itr += step;
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;
   }
 };
 
@@ -103,42 +111,26 @@ std::unique_ptr slice_strings(
   rmm::cuda_stream_view stream           = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource())
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
 
   if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0");
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_column       = *strings_column;
-  auto d_start        = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(start));
-  auto d_stop         = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
-  auto d_step         = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));
-
-  // copy the null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-
-  // build offsets column
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_new_offsets = offsets_column->view().data();
-
-  // build chars column
-  auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream);
-  auto chars_column = strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     strings_count,
-                     substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars});
+  auto const d_column = column_device_view::create(strings.parent(), stream);
+  auto const d_start  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(start));
+  auto const d_stop   = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
+  auto const d_step   = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));
 
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+  auto children = make_strings_children(substring_fn{*d_column, d_start, d_stop, d_step},
+                                        strings.size(),
+                                        strings.null_count(),
+                                        stream,
+                                        mr);
+
+  return make_strings_column(strings.size(),
+                             std::move(children.first),
+                             std::move(children.second),
                              strings.null_count(),
-                             std::move(null_mask),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }
@@ -166,25 +158,33 @@ namespace {
  * This both calculates the output size and executes the substring.
  */
 struct substring_from_fn {
-  const column_device_view d_column;
-  const cudf::detail::input_indexalator starts;
-  const cudf::detail::input_indexalator stops;
-  const int32_t* d_offsets{};
+  column_device_view const d_column;
+  cudf::detail::input_indexalator const starts;
+  cudf::detail::input_indexalator const stops;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_column.is_null(idx)) return 0;  // null string
-    string_view d_str = d_column.template element<string_view>(idx);
+    if (d_column.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const d_str  = d_column.template element<string_view>(idx);
     auto const length = d_str.length();
     auto const start  = starts[idx];
-    if (start >= length) return 0;  // empty string
+    if (start >= length) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     auto const stop = stops[idx];
     auto const end  = (((stop < 0) || (stop > length)) ? length : stop);
 
-    string_view d_substr = d_str.substr(start, end - start);
-    if (d_chars) memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes());
-    return d_substr.size_bytes();
+    auto const d_substr = d_str.substr(start, end - start);
+    if (d_chars)
+      memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes());
+    else
+      d_offsets[idx] = d_substr.size_bytes();
   }
 };
 
@@ -212,32 +212,18 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c
   auto strings_count = d_column.size();
 
   // Copy the null mask
-  rmm::device_buffer null_mask{0, stream, mr};
-  if (d_column.nullable())
-    null_mask = rmm::device_buffer(
-      d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr);
-
-  // Build offsets column
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_new_offsets = offsets_column->view().data();
-
-  // Build chars column
-  auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream);
-  auto chars_column =
-    cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
-  auto chars_view = chars_column->mutable_view();
-  auto d_chars    = chars_view.template data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     strings_count,
-                     substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars});
+  rmm::device_buffer null_mask =
+    !d_column.nullable()
+      ? rmm::device_buffer{0, stream, mr}
+      : rmm::device_buffer(
+          d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr);
+
+  auto children = make_strings_children(
+    substring_from_fn{d_column, starts, stops}, strings_count, null_count, stream, mr);
 
   return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+                             std::move(children.first),
+                             std::move(children.second),
                              null_count,
                              std::move(null_mask),
                              stream,