From 39ad863652d02b5f5e3b16b0fcdfe254a9e6a348 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Wed, 17 Mar 2021 06:06:13 -0700
Subject: [PATCH 01/26] Use cuFile for Parquet IO when available (#7444)

Adds optional cuFile integration:
- `cufile.h` is included in the build when available.
- `libcufile.so` is loaded at runtime if the `LIBCUDF_CUFILE_POLICY`
  environment variable is set to "ALWAYS" or "GDS".
- cuFile compatibility mode is set through the same policy variable -
  "ALWAYS" means on, "GDS" means off.
- cuFile is currently only used in Parquet R/W and in the CSV writer.
- The device_read/write API can be used with file datasource/data_sink.
- Added CUDA stream to `device_read`.

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - Keith Kraus (@kkraus14)
  - Karthikeyan (@karthikeyann)
  - Devavret Makkar (@devavret)
  - Robert Maynard (@robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/7444
---
 cpp/CMakeLists.txt                           |   8 +
 cpp/benchmarks/fixture/benchmark_fixture.hpp |   2 +-
 cpp/cmake/Modules/FindcuFile.cmake           |   6 +
 cpp/include/cudf/io/data_sink.hpp            |  26 +-
 cpp/include/cudf/io/datasource.hpp           |  94 ++++++-
 cpp/src/io/csv/writer_impl.cu                |  32 +--
 cpp/src/io/parquet/parquet_gpu.hpp           |   4 +-
 cpp/src/io/parquet/reader_impl.cu            |  23 +-
 cpp/src/io/parquet/reader_impl.hpp           |   4 +-
 cpp/src/io/parquet/writer_impl.cu            |  24 +-
 cpp/src/io/utilities/data_sink.cpp           |  45 +++-
 cpp/src/io/utilities/datasource.cpp          | 107 +++++---
 cpp/src/io/utilities/file_io_utilities.cpp   | 264 +++++++++++++++++++
 cpp/src/io/utilities/file_io_utilities.hpp   | 242 +++++++++++++++++
 14 files changed, 757 insertions(+), 124 deletions(-)
 create mode 100644 cpp/src/io/utilities/file_io_utilities.cpp
 create mode 100644 cpp/src/io/utilities/file_io_utilities.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 39acc362450..f15fd649b83 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -139,6 +139,8 @@ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake)
 include(cmake/thirdparty/CUDF_GetGTest.cmake)
 # Stringify libcudf and libcudacxx headers used in JIT operations
 include(cmake/Modules/StringifyJITHeaders.cmake)
+# find cuFile
+include(cmake/Modules/FindcuFile.cmake)
 
 ###################################################################################################
 # - library targets -------------------------------------------------------------------------------
@@ -244,6 +246,7 @@ add_library(cudf
   src/io/statistics/column_stats.cu
   src/io/utilities/data_sink.cpp
   src/io/utilities/datasource.cpp
+  src/io/utilities/file_io_utilities.cpp
   src/io/utilities/parsing_utils.cu
   src/io/utilities/type_conversion.cpp
   src/jit/cache.cpp
@@ -469,6 +472,11 @@ else()
   target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
 endif()
 
+# Add cuFile interface if available
+if(TARGET cuFile::cuFile_interface)
+  target_link_libraries(cudf PRIVATE cuFile::cuFile_interface)
+endif()
+
 file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld"
 [=[
 SECTIONS
diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp
index ad2ce095b6e..dd1bbcba0b4 100644
--- a/cpp/benchmarks/fixture/benchmark_fixture.hpp
+++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp
@@ -88,4 +88,4 @@ class benchmark : public ::benchmark::Fixture {
   std::shared_ptr<rmm::mr::device_memory_resource> mr;
 };
 
-};  // namespace cudf
+}  // namespace cudf
diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake
index e67b79d9d60..4f67e186f42 100644
--- a/cpp/cmake/Modules/FindcuFile.cmake
+++ b/cpp/cmake/Modules/FindcuFile.cmake
@@ -93,6 +93,12 @@
 find_package_handle_standard_args(cuFile
     cuFile_VERSION
 )
 
+if (cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface)
+  add_library(cuFile::cuFile_interface IMPORTED INTERFACE)
+  target_include_directories(cuFile::cuFile_interface INTERFACE "$<BUILD_INTERFACE:${cuFile_INCLUDE_DIR}>")
+  target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}")
+  target_compile_definitions(cuFile::cuFile_interface INTERFACE CUFILE_FOUND)
+endif ()
 
 if (cuFile_FOUND AND NOT TARGET cuFile::cuFile)
   add_library(cuFile::cuFile UNKNOWN IMPORTED)
diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp
index 0ae403458a0..e0eb60af070 100644
--- a/cpp/include/cudf/io/data_sink.hpp
+++ b/cpp/include/cudf/io/data_sink.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -107,23 +107,35 @@ class data_sink {
    */
   virtual bool supports_device_write() const { return false; }
 
+  /**
+   * @brief Estimates whether a direct device write would be more optimal for the given size.
+   *
+   * @param size Number of bytes to write
+   * @return whether the device write is expected to be more performant for the given size
+   */
+  virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); }
+
   /**
    * @brief Append the buffer content to the sink from a gpu address
    *
-   * @param[in] data Pointer to the buffer to be written into the sink object
-   * @param[in] size Number of bytes to write
+   * For optimal performance, should only be called when `is_device_write_preferred` returns
+   * `true`. Data sink implementations that don't support direct device writes don't need to
+   * override this function.
    *
-   * @return void
+   * @throws cudf::logic_error when the object does not support direct device writes, i.e.
+   * `supports_device_write` returns `false`.
+   *
+   * @param gpu_data Pointer to the buffer to be written into the sink object
+   * @param size Number of bytes to write
+   * @param stream CUDA stream to use
    */
   virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream)
   {
-    CUDF_FAIL("data_sink classes that support device_write must override this function.");
+    CUDF_FAIL("data_sink classes that support device_write must override it.");
   }
 
   /**
    * @brief Flush the data written into the sink
-   *
-   * @return void
    */
   virtual void flush() = 0;
diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index 88f2bd187e2..8fcc045e6d2 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 #include <memory>
 #include <string>
 #include <vector>
@@ -50,12 +52,15 @@ class datasource {
     /**
      * @brief Returns the address of the data in the buffer.
      */
-    virtual const uint8_t* data() const = 0;
+    virtual uint8_t const* data() const = 0;
 
     /**
      * @brief Base class destructor
      */
     virtual ~buffer() {}
+
+    template <typename Container>
+    static std::unique_ptr<buffer> create(Container&& data_owner);
   };
 
   /**
@@ -147,37 +152,57 @@ class datasource {
    */
   virtual bool supports_device_read() const { return false; }
 
+  /**
+   * @brief Estimates whether a direct device read would be more optimal for the given size.
+ * + * @param size Number of bytes to read + * @return whether the device read is expected to be more performant for the given size + */ + virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } + /** * @brief Returns a device buffer with a subset of data from the source. * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read + * @throws cudf::logic_error the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use * * @return The data buffer in the device memory */ - virtual std::unique_ptr device_read(size_t offset, size_t size) + virtual std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** * @brief Reads a selected range into a preallocated device buffer * + * For optimal performance, should only be called when `is_device_read_preferred` returns `true`. * Data source implementations that don't support direct device reads don't need to override this * function. * - * @param[in] offset Bytes from the start - * @param[in] size Bytes to read - * @param[in] dst Address of the existing device memory + * @throws cudf::logic_error when the object does not support direct device reads, i.e. + * `supports_device_read` returns `false`. + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use * * @return The number of bytes read (can be smaller than size) */ - virtual size_t device_read(size_t offset, size_t size, uint8_t* dst) + virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) { - CUDF_FAIL("datasource classes that support device_read must override this function."); + CUDF_FAIL("datasource classes that support device_read must override it."); } /** @@ -205,14 +230,57 @@ class datasource { size_t size() const override { return _size; } - const uint8_t* data() const override { return _data; } + uint8_t const* data() const override { return _data; } private: uint8_t* const _data; size_t const _size; }; + + /** + * @brief Derived implementation of `buffer` that owns the data. + * + * Can use different container types to hold the data buffer. + * + * @tparam Container Type of the container object that owns the data + */ + template + class owning_buffer : public buffer { + public: + /** + * @brief Moves the input container into the newly created object. + */ + owning_buffer(Container&& data_owner) + : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) + { + } + + /** + * @brief Moves the input container into the newly created object, and exposes a subspan of the + * buffer. 
+ */ + owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) + : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) + { + } + + size_t size() const override { return _size; } + + uint8_t const* data() const override { return static_cast(_data_ptr); } + + private: + Container _data; + void const* _data_ptr; + size_t _size; + }; }; +template +std::unique_ptr datasource::buffer::create(Container&& data_owner) +{ + return std::make_unique>(std::move(data_owner)); +} + /** * @brief Implementation class for reading from an Apache Arrow file. The file * could be a memory-mapped file or other implementation supported by Arrow. @@ -230,7 +298,7 @@ class arrow_io_source : public datasource { { } size_t size() const override { return arrow_buffer->size(); } - const uint8_t* data() const override { return arrow_buffer->data(); } + uint8_t const* data() const override { return arrow_buffer->data(); } }; public: diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index dda2e0704f6..f7e153d71f4 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -416,36 +416,28 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars().data(); - if (out_sink_->supports_device_write()) { - // host algorithm call, but the underlying call - // is a device_write taking a device buffer; - // + if (out_sink_->is_device_write_preferred(total_num_bytes)) { + // Direct write from device memory out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); - out_sink_->device_write(newline.data(), - newline.size(), - stream); // needs newline at the end, to separate from next chunk } else { - // no device write possible; - // - // copy the bytes to host, too: - // + // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), ptr_all_bytes, total_num_bytes * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); - // host algorithm call, where the underlying call - // is also host_write taking a host buffer; - // - char const* ptr_h_bytes = h_bytes.data(); - out_sink_->host_write(ptr_h_bytes, total_num_bytes); + out_sink_->host_write(h_bytes.data(), total_num_bytes); + } + + // Needs newline at the end, to separate from next chunk + if (out_sink_->is_device_write_preferred(newline.size())) { + out_sink_->device_write(newline.data(), newline.size(), stream); + } else { out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator() - .size()); // needs newline at the end, to separate from next chunk + options_.get_line_terminator().size()); } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 43d144ec980..f920aee1c29 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -184,7 +184,7 @@ struct ColumnChunkDesc { { } - uint8_t *compressed_data; // pointer to compressed column chunk data + uint8_t const *compressed_data; // pointer to compressed column chunk data size_t compressed_size; // total compressed data size for this chunk size_t num_values; // total number of values in this column size_t start_row; // starting row of this chunk diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a7a02cc6108..16cf0877c23 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -822,7 +822,7 @@ void generate_depth_remappings(std::map, std::ve * @copydoc cudf::io::detail::parquet::read_column_chunks */ void reader::impl::read_column_chunks( - std::vector &page_data, + std::vector> &page_data, hostdevice_vector &chunks, // TODO const? size_t begin_chunk, size_t end_chunk, @@ -850,9 +850,15 @@ void reader::impl::read_column_chunks( next_chunk++; } if (io_size != 0) { - auto buffer = _sources[chunk_source_map[chunk]]->host_read(io_offset, io_size); - page_data[chunk] = rmm::device_buffer(buffer->data(), buffer->size(), stream); - uint8_t *d_compdata = static_cast(page_data[chunk].data()); + auto &source = _sources[chunk_source_map[chunk]]; + if (source->is_device_read_preferred(io_size)) { + page_data[chunk] = source->device_read(io_offset, io_size, stream); + } else { + auto const buffer = source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } + auto d_compdata = page_data[chunk]->data(); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -1414,7 +1420,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - std::vector page_data(num_chunks); + std::vector> page_data(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1516,10 +1522,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, decomp_page_data = decompress_page_data(chunks, pages, stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED && page_data[c].size() != 0) { - page_data[c].resize(0); - page_data[c].shrink_to_fit(); - } + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } } } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 137fca03bfd..ca200936134 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,7 +91,7 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches. 
* */ - void read_column_chunks(std::vector &page_data, + void read_column_chunks(std::vector> &page_data, hostdevice_vector &chunks, size_t begin_chunk, size_t end_chunk, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a645ca0fd91..dd68bc50043 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1108,19 +1108,7 @@ void writer::impl::write(table_view const &table) num_stats_bfr); } - auto host_bfr = [&]() { - // if the writer supports device_write(), we don't need this scratch space - if (out_sink_->supports_device_write()) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_chunk_bfr_size), - cudaFreeHost}; - } - }(); + pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); @@ -1155,7 +1143,7 @@ void writer::impl::write(table_view const &table) dev_bfr = ck->uncompressed_bfr; } - if (out_sink_->supports_device_write()) { + if (out_sink_->is_device_write_preferred(ck->compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream); // we still need to do a (much smaller) memcpy for the statistics. @@ -1170,6 +1158,14 @@ void writer::impl::write(table_view const &table) stream.synchronize(); } } else { + if (!host_bfr) { + host_bfr = pinned_buffer{[](size_t size) { + uint8_t *ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_chunk_bfr_size), + cudaFreeHost}; + } // copy the full data CUDA_TRY(cudaMemcpyAsync(host_bfr.get(), dev_bfr, diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 48558005303..10af7bcb0bd 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
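
[Editor's note - illustrative only, not part of the patch: per the commit message, the cuFile path
is opted into via the `LIBCUDF_CUFILE_POLICY` environment variable, which must be set before the
first cuDF IO call. A hedged usage sketch, assuming the era-appropriate `read_parquet` API; the
file name is a placeholder.]

    #include <cudf/io/parquet.hpp>

    #include <cstdlib>

    int main()
    {
      // "GDS" enables cuFile with compatibility mode off; "ALWAYS" also turns
      // compatibility mode on (see the policy description in the commit message).
      setenv("LIBCUDF_CUFILE_POLICY", "GDS", 1);

      auto options = cudf::io::parquet_reader_options::builder(
                       cudf::io::source_info{"example.parquet"})
                       .build();
      // With the policy set, the file datasource may service this via device_read.
      auto result = cudf::io::read_parquet(options);
      return 0;
    }
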
@@ -18,6 +18,7 @@ #include #include +#include #include @@ -29,24 +30,44 @@ namespace io { class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) + : _cufile_out(detail::make_cufile_output(filepath)) { - outfile_.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); - CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); + _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); + CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); } virtual ~file_sink() { flush(); } void host_write(void const* data, size_t size) override { - outfile_.write(static_cast(data), size); + _output_stream.seekp(_bytes_written); + _output_stream.write(static_cast(data), size); + _bytes_written += size; } - void flush() override { outfile_.flush(); } + void flush() override { _output_stream.flush(); } - size_t bytes_written() override { return outfile_.tellp(); } + size_t bytes_written() override { return _bytes_written; } + + bool supports_device_write() const override { return _cufile_out != nullptr; } + + bool is_device_write_preferred(size_t size) const override + { + return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); + } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + + _cufile_out->write(gpu_data, _bytes_written, size); + _bytes_written += size; + } private: - std::ofstream outfile_; + std::ofstream _output_stream; + size_t _bytes_written = 0; + std::unique_ptr _cufile_out; }; /** @@ -77,25 +98,25 @@ class host_buffer_sink : public data_sink { */ class void_sink : public data_sink { public: - explicit void_sink() : bytes_written_(0) {} + explicit void_sink() : _bytes_written(0) {} virtual ~void_sink() {} - void host_write(void const* data, size_t size) override { bytes_written_ += size; } + void host_write(void const* data, size_t size) override { _bytes_written += size; } bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { - bytes_written_ += size; + _bytes_written += size; } void flush() override {} - size_t bytes_written() override { return bytes_written_; } + size_t bytes_written() override { return _bytes_written; } private: - size_t bytes_written_; + size_t _bytes_written; }; class user_sink_wrapper : public data_sink { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 74163d023be..3f2884d5b7d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,14 @@ * limitations under the License. */ +#include + #include #include -#include -#include #include -#include #include +#include namespace cudf { namespace io { @@ -34,12 +34,6 @@ namespace io { * mapping a subset of the file where the starting offset may not be zero. 
*/ class memory_mapped_source : public datasource { - struct file_wrapper { - const int fd = -1; - explicit file_wrapper(const char *filepath) : fd(open(filepath, O_RDONLY)) {} - ~file_wrapper() { close(fd); } - }; - class memory_mapped_buffer : public buffer { size_t _size = 0; uint8_t *_data = nullptr; @@ -52,77 +46,99 @@ class memory_mapped_source : public datasource { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) + : _cufile_in(detail::make_cufile_input(filepath)) { - auto const file = file_wrapper(filepath); - CUDF_EXPECTS(file.fd != -1, "Cannot open file"); - - struct stat st; - CUDF_EXPECTS(fstat(file.fd, &st) != -1, "Cannot query file size"); - file_size_ = static_cast(st.st_size); - - if (file_size_ != 0) { map(file.fd, offset, size); } + auto const file = detail::file_wrapper(filepath, O_RDONLY); + _file_size = file.size(); + if (_file_size != 0) { map(file.desc(), offset, size); } } virtual ~memory_mapped_source() { - if (map_addr_ != nullptr) { munmap(map_addr_, map_size_); } + if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); return std::make_unique( - static_cast(map_addr_) + (offset - map_offset_), read_size); + static_cast(_map_addr) + (offset - _map_offset), read_size); } size_t host_read(size_t offset, size_t size, uint8_t *dst) override { - CUDF_EXPECTS(offset >= map_offset_, "Requested offset is outside mapping"); + CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region - auto const read_size = std::min(size, map_size_ - (offset - map_offset_)); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - auto const src = static_cast(map_addr_) + (offset - map_offset_); + auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); return read_size; } - size_t size() const override { return file_size_; } + bool supports_device_read() const override { return _cufile_in != nullptr; } + + bool is_device_read_preferred(size_t size) const + { + return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, stream); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override + { + if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); + auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return _cufile_in->read(offset, read_size, dst, stream); + } + + size_t size() const override { return _file_size; } private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < file_size_, "Offset is past end of file"); + CUDF_EXPECTS(offset < _file_size, "Offset is past end of file"); // Offset for `mmap()` must be page 
aligned - auto const map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); + _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); // Clamp length to available data in the file if (size == 0) { - size = file_size_ - offset; + size = _file_size - offset; } else { - if ((offset + size) > file_size_) { size = file_size_ - offset; } + if ((offset + size) > _file_size) { size = _file_size - offset; } } // Size for `mmap()` needs to include the page padding - const auto map_size = size + (offset - map_offset); + _map_size = size + (offset - _map_offset); // Check if accessing a region within already mapped area - map_addr_ = mmap(NULL, map_size, PROT_READ, MAP_PRIVATE, fd, map_offset); - CUDF_EXPECTS(map_addr_ != MAP_FAILED, "Cannot create memory mapping"); - map_offset_ = map_offset; - map_size_ = map_size; + _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); + CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } private: - size_t file_size_ = 0; - void *map_addr_ = nullptr; - size_t map_size_ = 0; - size_t map_offset_ = 0; + size_t _file_size = 0; + void *_map_addr = nullptr; + size_t _map_size = 0; + size_t _map_offset = 0; + std::unique_ptr _cufile_in; }; /** @@ -148,14 +164,19 @@ class user_datasource_wrapper : public datasource { bool supports_device_read() const override { return source->supports_device_read(); } - size_t device_read(size_t offset, size_t size, uint8_t *dst) override + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size, dst); + return source->device_read(offset, size, dst, stream); } - std::unique_ptr device_read(size_t offset, size_t size) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { - return source->device_read(offset, size); + return source->device_read(offset, size, stream); } size_t size() const override { return source->size(); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp new file mode 100644 index 00000000000..22ff057cbc1 --- /dev/null +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace detail { + +file_wrapper::file_wrapper(std::string const &filepath, int flags) + : fd(open(filepath.c_str(), flags)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) + : fd(open(filepath.c_str(), flags, mode)) +{ + CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); +} + +file_wrapper::~file_wrapper() { close(fd); } + +long file_wrapper::size() const +{ + if (_size < 0) { + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + _size = static_cast(st.st_size); + } + return _size; +} + +#ifdef CUFILE_FOUND + +/** + * @brief Class that manages cuFile configuration. + */ +class cufile_config { + std::string const default_policy = "OFF"; + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + + std::string const policy = default_policy; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + std::string getenv_or(std::string const &env_var_name, std::string const &default_val) + { + auto const env_val = std::getenv(env_var_name.c_str()); + return (env_val == nullptr) ? default_val : std::string(env_val); + } + + cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} + { + if (is_enabled()) { + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value + // Enable compatiblity mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } + } + } + + public: + /** + * @brief Returns true when cuFile use is enabled. + */ + bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } + + /** + * @brief Returns true when cuDF should not fall back to host IO. + */ + bool is_required() const { return policy == "ALWAYS"; } + + static cufile_config const *instance() + { + static cufile_config _instance; + return &_instance; + } +}; + +/** + * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. 
+ */ +class cufile_shim { + private: + cufile_shim(); + + void *cf_lib = nullptr; + decltype(cuFileDriverOpen) *driver_open = nullptr; + decltype(cuFileDriverClose) *driver_close = nullptr; + + std::unique_ptr init_error; + auto is_valid() const noexcept { return init_error == nullptr; } + + public: + cufile_shim(cufile_shim const &) = delete; + cufile_shim &operator=(cufile_shim const &) = delete; + + static cufile_shim const *instance(); + + ~cufile_shim() + { + driver_close(); + dlclose(cf_lib); + } + + decltype(cuFileHandleRegister) *handle_register = nullptr; + decltype(cuFileHandleDeregister) *handle_deregister = nullptr; + decltype(cuFileRead) *read = nullptr; + decltype(cuFileWrite) *write = nullptr; +}; + +cufile_shim::cufile_shim() +{ + try { + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, + "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); + + CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); + } catch (cudf::logic_error const &err) { + init_error = std::make_unique(err); + } +} + +cufile_shim const *cufile_shim::instance() +{ + static cufile_shim _instance; + // Defer throwing to avoid repeated attempts to load the library + if (!_instance.is_valid()) CUDF_FAIL("" + std::string(_instance.init_error->what())); + + return &_instance; +} + +void cufile_registered_file::register_handle() +{ + CUfileDescr_t cufile_desc{}; + cufile_desc.handle.fd = _file.desc(); + cufile_desc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUDF_EXPECTS(shim->handle_register(&cf_handle, &cufile_desc).err == CU_FILE_SUCCESS, + "Cannot register file handle with cuFile"); +} + +cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } + +cufile_input_impl::cufile_input_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) +{ +} + +std::unique_ptr cufile_input_impl::read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) +{ + rmm::device_buffer out_data(size, stream); + CUDF_EXPECTS(shim->read(cf_file.handle(), out_data.data(), size, offset, 0) != -1, + "cuFile error reading from a file"); + + return datasource::buffer::create(std::move(out_data)); +} + +size_t cufile_input_impl::read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, + "cuFile error reading from a file"); + // always read the requested size for now + return size; +} + +cufile_output_impl::cufile_output_impl(std::string const &filepath) + : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 
+          0664)
+{
+}
+
+void cufile_output_impl::write(void const *data, size_t offset, size_t size)
+{
+  CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1,
+               "cuFile error writing to a file");
+}
+#endif
+
+std::unique_ptr<cufile_input_impl> make_cufile_input(std::string const &filepath)
+{
+#ifdef CUFILE_FOUND
+  if (cufile_config::instance()->is_enabled()) {
+    try {
+      return std::make_unique<cufile_input_impl>(filepath);
+    } catch (...) {
+      if (cufile_config::instance()->is_required()) throw;
+    }
+  }
+#endif
+  return nullptr;
+}
+
+std::unique_ptr<cufile_output_impl> make_cufile_output(std::string const &filepath)
+{
+#ifdef CUFILE_FOUND
+  if (cufile_config::instance()->is_enabled()) {
+    try {
+      return std::make_unique<cufile_output_impl>(filepath);
+    } catch (...) {
+      if (cufile_config::instance()->is_required()) throw;
+    }
+  }
+#endif
+  return nullptr;
+}
+
+}  // namespace detail
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
new file mode 100644
index 00000000000..85399bdd44d
--- /dev/null
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef CUFILE_FOUND
+#include <cufile.h>
+#endif
+
+#include <cudf/io/datasource.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+
+#include <string>
+
+namespace cudf {
+namespace io {
+namespace detail {
+
+/**
+ * @brief Class that provides RAII for file handling.
+ */
+class file_wrapper {
+  int const fd       = -1;
+  long mutable _size = -1;
+
+ public:
+  explicit file_wrapper(std::string const &filepath, int flags);
+  explicit file_wrapper(std::string const &filepath, int flags, mode_t mode);
+  ~file_wrapper();
+  long size() const;
+  auto desc() const { return fd; }
+};
+
+/**
+ * @brief Base class for cuFile input/output.
+ *
+ * Contains the common API for cuFile input and output classes.
+ */
+class cufile_io_base {
+ public:
+  /**
+   * @brief Returns an estimate of whether the cuFile operation is the optimal option.
+   *
+   * @param size Read/write operation size, in bytes.
+   * @return Whether a cuFile operation with the given size is expected to be faster than a host
+   * read + H2D copy
+   */
+  static bool is_cufile_io_preferred(size_t size) { return size > op_size_threshold; }
+
+ protected:
+  /**
+   * @brief The read/write size above which cuFile is faster than host read + copy
+   *
+   * This may not be the optimal threshold for all systems. Derived `is_cufile_io_preferred`
+   * implementations can use different logic.
+   */
+  static constexpr size_t op_size_threshold = 128 << 10;
+};
+
+/**
+ * @brief Interface class for cufile input.
+ */
+class cufile_input : public cufile_io_base {
+ public:
+  /**
+   * @brief Reads into a new device buffer.
+ * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param stream CUDA stream to use + * + * @return The data buffer in the device memory + */ + virtual std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) = 0; + + /** + * @brief Reads into existing device memory. + * + * @throws cudf::logic_error on cuFile error + * + * @param offset Number of bytes from the start + * @param size Number of bytes to read + * @param dst Address of the existing device memory + * @param stream CUDA stream to use + * + * @return The number of bytes read + */ + virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; +}; + +/** + * @brief Interface class for cufile output. + */ +class cufile_output : public cufile_io_base { + public: + /** + * @brief Writes the data from a device buffer into a file. + * + * @throws cudf::logic_error on cuFile error + * + * @param data Pointer to the buffer to be written into the output file + * @param offset Number of bytes from the start + * @param size Number of bytes to write + */ + virtual void write(void const *data, size_t offset, size_t size) = 0; +}; + +#ifdef CUFILE_FOUND + +class cufile_shim; + +/** + * @brief Class that provides RAII for cuFile file registration. + */ +struct cufile_registered_file { + void register_handle(); + + public: + cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags) + : _file(filepath, flags), shim{shim} + { + register_handle(); + } + + cufile_registered_file(cufile_shim const *shim, + std::string const &filepath, + int flags, + mode_t mode) + : _file(filepath, flags, mode), shim{shim} + { + register_handle(); + } + + auto const &handle() const noexcept { return cf_handle; } + + ~cufile_registered_file(); + + private: + file_wrapper const _file; + CUfileHandle_t cf_handle = nullptr; + cufile_shim const *shim = nullptr; +}; + +/** + * @brief Adapter for the `cuFileRead` API. + * + * Exposes APIs to read directly from a file into device memory. + */ +class cufile_input_impl final : public cufile_input { + public: + cufile_input_impl(std::string const &filepath); + + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override; + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; + +/** + * @brief Adapter for the `cuFileWrite` API. + * + * Exposes an API to write directly into a file from device memory. 
+ */ +class cufile_output_impl final : public cufile_output { + public: + cufile_output_impl(std::string const &filepath); + + void write(void const *data, size_t offset, size_t size) override; + + private: + cufile_shim const *shim = nullptr; + cufile_registered_file const cf_file; +}; +#else + +class cufile_input_impl final : public cufile_input { + public: + std::unique_ptr read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } + + size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; + +class cufile_output_impl final : public cufile_output { + public: + void write(void const *data, size_t offset, size_t size) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } +}; +#endif + +/** + * @brief Creates a `cufile_input_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_input_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_input(std::string const &filepath); + +/** + * @brief Creates a `cufile_output_impl` object + * + * Returns a null pointer if an exception occurs in the `cufile_output_impl` constructor, or if the + * cuFile library is not installed. + */ +std::unique_ptr make_cufile_output(std::string const &filepath); + +} // namespace detail +} // namespace io +} // namespace cudf From 9c6e1baf023012aacc7bcada921658db0a9993eb Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 17 Mar 2021 10:47:39 -0500 Subject: [PATCH 02/26] Refactor Java host-side buffer concatenation to expose separate steps (#7610) This refactors `JCudfSerialization.concatToContiguousTable` to expose the separate steps of concatenating to a single host-side buffer and constructing a device-side contiguous table from that host buffer. This allows application code to perform other operations in-between those two steps. Authors: - Jason Lowe (@jlowe) Approvers: - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7610 --- .../ai/rapids/cudf/JCudfSerialization.java | 94 ++++++++++++++----- 1 file changed, 71 insertions(+), 23 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index bf49fb59d52..6c52b8fe798 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -353,6 +353,50 @@ static SerializedColumnHeader readFrom(DataInputStream din, long rowCount) throw } } + /** Class to hold the header and buffer pair result from host-side concatenation */ + public static final class HostConcatResult implements AutoCloseable { + private final SerializedTableHeader tableHeader; + private final HostMemoryBuffer hostBuffer; + + public HostConcatResult(SerializedTableHeader tableHeader, HostMemoryBuffer tableBuffer) { + this.tableHeader = tableHeader; + this.hostBuffer = tableBuffer; + } + + public SerializedTableHeader getTableHeader() { + return tableHeader; + } + + public HostMemoryBuffer getHostBuffer() { + return hostBuffer; + } + + /** Build a contiguous table in device memory from this host-concatenated result */ + public ContiguousTable toContiguousTable() { + DeviceMemoryBuffer devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); + try { + if (hostBuffer.length > 0) { + devBuffer.copyFromHostBuffer(hostBuffer); + } + Table table = sliceUpColumnVectors(tableHeader, devBuffer, hostBuffer); + try { + return new ContiguousTable(table, devBuffer); + } catch (Exception e) { + table.close(); + throw e; + } + } catch (Exception e) { + devBuffer.close(); + throw e; + } + } + + @Override + public void close() { + hostBuffer.close(); + } + } + /** * Visible for testing */ @@ -1681,15 +1725,32 @@ public static Table readAndConcat(SerializedTableHeader[] headers, return ct.getTable(); } + /** + * Concatenate multiple tables in host memory into a contiguous table in device memory. + * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return contiguous table in device memory + */ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] headers, HostMemoryBuffer[] dataBuffers) throws IOException { + try (HostConcatResult concatResult = concatToHostBuffer(headers, dataBuffers)) { + return concatResult.toContiguousTable(); + } + } + + /** + * Concatenate multiple tables in host memory into a single host table buffer. 
+ * @param headers table headers corresponding to the host table buffers + * @param dataBuffers host table buffer for each input table to be concatenated + * @return host table header and buffer + */ + public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] headers, + HostMemoryBuffer[] dataBuffers) throws IOException { ColumnBufferProvider[][] providersPerColumn = providersFrom(headers, dataBuffers); - DeviceMemoryBuffer devBuffer = null; - Table table = null; try { SerializedTableHeader combined = calcConcatHeader(providersPerColumn); - - try (HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen)) { + HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(combined.dataLen); + try { try (NvtxRange range = new NvtxRange("Concat Host Side", NvtxColor.GREEN)) { DataWriter writer = writerFrom(hostBuffer); int numColumns = combined.getNumColumns(); @@ -1697,27 +1758,14 @@ public static ContiguousTable concatToContiguousTable(SerializedTableHeader[] he writeConcat(writer, combined.getColumnHeader(columnIdx), providersPerColumn[columnIdx]); } } - - devBuffer = DeviceMemoryBuffer.allocate(hostBuffer.length); - if (hostBuffer.length > 0) { - try (NvtxRange range = new NvtxRange("Copy Data To Device", NvtxColor.WHITE)) { - devBuffer.copyFromHostBuffer(hostBuffer); - } - } - table = sliceUpColumnVectors(combined, devBuffer, hostBuffer); - ContiguousTable result = new ContiguousTable(table, devBuffer); - table = null; - devBuffer = null; - return result; + } catch (Exception e) { + hostBuffer.close(); + throw e; } + + return new HostConcatResult(combined, hostBuffer); } finally { closeAll(providersPerColumn); - if (table != null) { - table.close(); - } - if (devBuffer != null) { - devBuffer.close(); - } } } From 0b766c5a1de627ad70a6d83e167104a52db1565b Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 17 Mar 2021 10:47:56 -0500 Subject: [PATCH 03/26] Add JNI support for IDENTITY hash partitioning (#7626) This adds in support for identity hash partitioning in JNI. Authors: - Robert (Bobby) Evans (@revans2) Approvers: - Jason Lowe (@jlowe) URL: https://github.com/rapidsai/cudf/pull/7626 --- .../main/java/ai/rapids/cudf/HashType.java | 6 +- java/src/main/java/ai/rapids/cudf/Table.java | 21 ++++++- java/src/main/native/src/TableJni.cpp | 8 ++- .../test/java/ai/rapids/cudf/TableTest.java | 58 +++++++++++++++++-- 4 files changed, 83 insertions(+), 10 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index b521bc5c42c..eb31edd8222 100644 --- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
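
[Editor's note - illustrative only, not part of the patch: the JNI change below forwards the new
`hashTypeId` to libcudf's `cudf::hash_partition`, whose 4-argument call is visible in TableJni.cpp
further down. A minimal C++ sketch of the equivalent identity-hash call; the wrapper name and the
choice of column 0 are placeholders, and construction of the input table is elided.]

    #include <cudf/partitioning.hpp>
    #include <cudf/table/table.hpp>
    #include <cudf/table/table_view.hpp>

    #include <memory>
    #include <utility>
    #include <vector>

    std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>>
    partition_by_identity(cudf::table_view const& input, int num_partitions)
    {
      std::vector<cudf::size_type> columns_to_hash{0};  // hash on the first column
      // Returns the partitioned table plus the start offset of each partition
      return cudf::hash_partition(
        input, columns_to_hash, num_partitions, cudf::hash_id::HASH_IDENTITY);
    }
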
@@ -22,8 +22,8 @@ * Hash algorithm identifiers, mirroring native enum cudf::hash_id */ public enum HashType { - // TODO IDENTITY(0), - // TODO MURMUR3(1), + IDENTITY(0), + MURMUR3(1), HASH_MD5(2), HASH_SERIAL_MURMUR3(3), HASH_SPARK_MURMUR3(4); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index fcc23777d69..7385b55d0df 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -185,6 +185,7 @@ public long getDeviceMemorySize() { private static native long[] hashPartition(long inputTable, int[] columnsToHash, + int hashTypeId, int numberOfPartitions, int[] outputOffsets) throws CudfException; @@ -2587,15 +2588,31 @@ public Table leftAntiJoin(TableOperation rightJoinIndices) { } /** - * Hash partition a table into the specified number of partitions. + * Hash partition a table into the specified number of partitions. Uses the default MURMUR3 + * hashing. * @param numberOfPartitions - number of partitions to use * @return - {@link PartitionedTable} - Table that exposes a limited functionality of the * {@link Table} class */ public PartitionedTable hashPartition(int numberOfPartitions) { + return hashPartition(HashType.MURMUR3, numberOfPartitions); + } + + /** + * Hash partition a table into the specified number of partitions. + * @param type the type of hash to use. Depending on the type of hash different restrictions + * on the hash column(s) may exist. Not all hash functions are guaranteed to work + * besides IDENTITY and MURMUR3. + * @param numberOfPartitions - number of partitions to use + * @return {@link PartitionedTable} - Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable hashPartition(HashType type, int numberOfPartitions) { int[] partitionOffsets = new int[numberOfPartitions]; - return new PartitionedTable(new Table(Table.hashPartition(operation.table.nativeHandle, + return new PartitionedTable(new Table(Table.hashPartition( + operation.table.nativeHandle, operation.indices, + type.nativeId, partitionOffsets.length, partitionOffsets)), partitionOffsets); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index e051f68be4e..4548156055a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1616,6 +1617,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass, jlong input_table, jintArray columns_to_hash, + jint hash_function, jint number_of_partitions, jintArray output_offsets) { @@ -1626,6 +1628,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env try { cudf::jni::auto_set_device(env); + cudf::hash_id hash_func = static_cast(hash_function); cudf::table_view *n_input_table = reinterpret_cast(input_table); cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash); cudf::jni::native_jintArray n_output_offsets(env, output_offsets); @@ -1638,7 +1641,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env } std::pair, std::vector> result = - cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions); + cudf::hash_partition(*n_input_table, + columns_to_hash_vec, + number_of_partitions, + hash_func); for (size_t i = 0; i < result.second.size(); i++) { 
n_output_offsets[i] = result.second[i]; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 88196a4112a..626f7828012 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1742,7 +1742,7 @@ void testPartStability() { final int PARTS = 5; int expectedPart = -1; try (Table start = new Table.TestBuilder().column(0).build(); - PartitionedTable out = start.onColumns(0).partition(PARTS)) { + PartitionedTable out = start.onColumns(0).hashPartition(PARTS)) { // Lets figure out what partitions this is a part of. int[] parts = out.getPartitions(); for (int i = 0; i < parts.length; i++) { @@ -1755,7 +1755,7 @@ void testPartStability() { for (int numEntries = 1; numEntries < COUNT; numEntries++) { try (ColumnVector data = ColumnVector.build(DType.INT32, numEntries, Range.appendInts(0, numEntries)); Table t = new Table(data); - PartitionedTable out = t.onColumns(0).partition(PARTS); + PartitionedTable out = t.onColumns(0).hashPartition(PARTS); HostColumnVector tmp = out.getColumn(0).copyToHost()) { // Now we need to get the range out for the partition we expect int[] parts = out.getPartitions(); @@ -1774,7 +1774,7 @@ void testPartStability() { } @Test - void testPartition() { + void testIdentityHashPartition() { final int count = 1024 * 1024; try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count)); ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> { @@ -1793,7 +1793,57 @@ void testPartition() { expected.add(i); } try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn}); - PartitionedTable output = input.onColumns(0).partition(5)) { + PartitionedTable output = input.onColumns(0).hashPartition(HashType.IDENTITY, 5)) { + int[] parts = output.getPartitions(); + assertEquals(5, parts.length); + assertEquals(0, parts[0]); + int previous = 0; + long rows = 0; + for (int i = 1; i < parts.length; i++) { + assertTrue(parts[i] >= previous); + rows += parts[i] - previous; + previous = parts[i]; + } + assertTrue(rows <= count); + try (HostColumnVector aOut = output.getColumn(0).copyToHost(); + HostColumnVector bOut = output.getColumn(1).copyToHost(); + HostColumnVector cOut = output.getColumn(2).copyToHost()) { + + for (int i = 0; i < count; i++) { + long fromA = aOut.getLong(i); + long fromB = bOut.getInt(i); + String fromC = cOut.getJavaString(i); + assertTrue(expected.remove(fromA)); + assertEquals(fromA / 2, fromB); + assertEquals(String.valueOf(fromA), fromC, "At Index " + i); + } + assertTrue(expected.isEmpty()); + } + } + } + } + + @Test + void testHashPartition() { + final int count = 1024 * 1024; + try (ColumnVector aIn = ColumnVector.build(DType.INT64, count, Range.appendLongs(count)); + ColumnVector bIn = ColumnVector.build(DType.INT32, count, (b) -> { + for (int i = 0; i < count; i++) { + b.append(i / 2); + } + }); + ColumnVector cIn = ColumnVector.build(DType.STRING, count, (b) -> { + for (int i = 0; i < count; i++) { + b.appendUTF8String(String.valueOf(i).getBytes()); + } + })) { + + HashSet expected = new HashSet<>(); + for (long i = 0; i < count; i++) { + expected.add(i); + } + try (Table input = new Table(new ColumnVector[]{aIn, bIn, cIn}); + PartitionedTable output = input.onColumns(0).hashPartition(5)) { int[] parts = output.getPartitions(); assertEquals(5, parts.length); assertEquals(0, parts[0]); From 3349764b9a0699bf96f7403895df88f7308647fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 
12:48:09 -0600 Subject: [PATCH 04/26] Refactor string conversion check (#7599) This addresses #7557. In summary: * Move `cudf::strings::is_integer()` code from `strings/chars_types.*` to `strings/convert/convert_integers.hpp/cu` * Move `cudf::strings::is_float()` code from `strings/chars_types.*` to `strings/convert/convert_floats.hpp/cu` * Remove `cudf::strings::all_integer()` and `cudf::strings::all_float()` Authors: - Nghia Truong (@ttnghia) Approvers: - GALI PREM SAGAR (@galipremsagar) - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7599 --- .../cudf/strings/char_types/char_types.hpp | 78 +----------- .../cudf/strings/convert/convert_floats.hpp | 26 +++- .../cudf/strings/convert/convert_integers.hpp | 26 +++- cpp/src/strings/char_types/char_types.cu | 113 +----------------- cpp/src/strings/convert/convert_floats.cu | 41 ++++++- cpp/src/strings/convert/convert_integers.cu | 41 ++++++- cpp/tests/strings/chars_types_tests.cpp | 63 ---------- cpp/tests/strings/floats_tests.cpp | 35 ++++++ cpp/tests/strings/integers_tests.cu | 23 +++- java/src/main/native/src/ColumnViewJni.cpp | 1 - .../cudf/cudf/_lib/cpp/strings/char_types.pxd | 10 +- .../cpp/strings/convert/convert_floats.pxd | 6 +- .../cpp/strings/convert/convert_integers.pxd | 6 +- python/cudf/cudf/_lib/strings/char_types.pyx | 36 +----- .../_lib/strings/convert/convert_floats.pyx | 29 +++++ .../_lib/strings/convert/convert_integers.pyx | 29 +++++ python/cudf/cudf/core/column/string.py | 6 +- python/cudf/cudf/core/tools/datetimes.py | 4 +- 18 files changed, 265 insertions(+), 308 deletions(-) create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 300722920f4..1f5b6241850 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,82 +146,6 @@ std::unique_ptr filter_characters_of_type( string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to integers. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7'] - * b = s.is_integer(s) - * b is [true, true, false, false, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to integers. 
- * - * This function will return `true` if all string elements - * has at least one character in [-+0-9]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_integer(strings_column_view const& strings); - -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to floats. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9eE.]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] - * b = s.is_float(s) - * b is [true, true, false, false, true, true, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to floats. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9eE.]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_float(strings_column_view const& strings); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index cb4746dbf40..d1e00b36f6f 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,30 @@ std::unique_ptr from_floats( column_view const& floats, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to floats. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9eE.]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] + * b = s.is_float(s) + * b is [true, true, false, false, true, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. 
+ */ +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 8f42deb380d..1e2fa80b129 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,6 +73,30 @@ std::unique_ptr from_integers( column_view const& integers, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7'] + * b = s.is_integer(s) + * b is [true, true, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a new integer numeric column parsing hexadecimal values from the * provided strings column. diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 10496b89328..0b384ad0631 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -186,91 +186,6 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str mr); } -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - } // namespace detail // external API @@ -295,31 +210,5 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - -std::unique_ptr 
is_float(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_float(strings, rmm::cuda_stream_default, mr); -} - -bool all_integer(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_integer(strings, rmm::cuda_stream_default); -} - -bool all_float(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_float(strings, rmm::cuda_stream_default); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2bf65976986..b6d99efd51f 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -536,12 +537,50 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API - std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::from_floats(floats, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + // check strings for valid float chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_float(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace detail + +// external API +std::unique_ptr is_float(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_float(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 112550fc25b..5c5032b5c87 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,6 @@ std::unique_ptr from_integers(column_view const& integers, } // namespace detail // external API - std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { @@ -253,5 +253,42 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_integer(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} +} // namespace detail + +// external API +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 803a9b01b07..702329edaba 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -228,54 +227,6 @@ TEST_F(StringsCharsTest, Numerics) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsCharsTest, Integers) -{ - cudf::test::strings_column_wrapper strings1( - {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_integer(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_integer(cudf::strings_column_view(strings2))); -} - -TEST_F(StringsCharsTest, Floats) -{ - cudf::test::strings_column_wrapper strings1({"+175", - "-9.8", - "7+2", - "+-4", - "6.7e17", - "-1.2e-5", - "e", - ".e", - "1.e+-2", - "00.00", - "1.0e+1.0", - "1.2.3", - "+", - "--", - ""}); - auto results = cudf::strings::is_float(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1( - {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_float(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); - results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_float(cudf::strings_column_view(strings2))); -} - TEST_F(StringsCharsTest, EmptyStrings) { cudf::test::strings_column_wrapper strings({"", "", ""}); @@ -284,12 +235,6 @@ TEST_F(StringsCharsTest, EmptyStrings) auto results = cudf::strings::all_characters_of_type( strings_view, cudf::strings::string_character_types::ALPHANUM); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::is_integer(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_integer(strings_view)); - results = cudf::strings::is_float(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_float(strings_view)); } TEST_F(StringsCharsTest, FilterCharTypes) @@ -379,14 +324,6 @@ TEST_F(StringsCharsTest, EmptyStringsColumn) EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); EXPECT_EQ(0, results->view().size()); - results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - - results = cudf::strings::is_float(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - results = cudf::strings::filter_characters_of_type( strings_view, cudf::strings::string_character_types::NUMERIC); EXPECT_EQ(cudf::type_id::STRING, results->view().type().id()); diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index b98416d9edd..f7151363d83 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ 
b/cpp/tests/strings/floats_tests.cpp @@ -27,6 +27,41 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsFloat) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_float(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1({"+175", + "-9.8", + "7+2", + "+-4", + "6.7e17", + "-1.2e-5", + "e", + ".e", + "1.e+-2", + "00.00", + "1.0e+1.0", + "1.2.3", + "+", + "--", + ""}); + results = cudf::strings::is_float(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1( + {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + results = cudf::strings::is_float(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToFloats32) { std::vector h_strings{"1234", diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 9e2b9809b26..d6bf03b3f76 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,27 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsInteger) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToInteger) { std::vector h_strings{ diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0ce9d6303e4..ac14e1605d7 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd index ad675027c10..934269c6f25 100644 --- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column_view cimport column_view @@ -33,11 +33,3 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ string_character_types types_to_remove, string_scalar replacement, string_character_types types_to_keep) except + - - cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + - - cdef unique_ptr[column] is_float( - column_view source_strings - ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd index baee01b8f99..55a84b60efd 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -14,3 +14,7 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ cdef unique_ptr[column] from_floats( column_view input_col) except + + + cdef unique_ptr[column] is_float( + column_view source_strings + ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd index 92f99a2f5cb..6e45d4ba869 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -15,6 +15,10 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ cdef unique_ptr[column] from_integers( column_view input_col) except + + cdef unique_ptr[column] is_integer( + column_view source_strings + ) except + + cdef unique_ptr[column] hex_to_integers( column_view input_col, data_type output_type) except + diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 5d8d1522418..1890e98f956 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -14,8 +14,6 @@ from cudf._lib.cpp.strings.char_types cimport ( all_characters_of_type as cpp_all_characters_of_type, filter_characters_of_type as cpp_filter_characters_of_type, string_character_types as string_character_types, - is_integer as cpp_is_integer, - is_float as cpp_is_float, ) @@ -191,35 +189,3 @@ def is_space(Column source_strings): )) return Column.from_unique_ptr(move(c_result)) - - -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have intergers. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) - - -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. 
- """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..195d9b71f6e --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_floats cimport ( + is_float as cpp_is_float, +) + + +def is_float(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have floats. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_float( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..d1bae1edd37 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_integers cimport ( + is_integer as cpp_is_integer, +) + + +def is_integer(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have intergers. 
+ """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_integer( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ea01aa07b91..11dd7556812 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -70,13 +70,15 @@ is_alpha as cpp_is_alpha, is_decimal as cpp_is_decimal, is_digit as cpp_is_digit, - is_float as cpp_is_float, - is_integer as cpp_is_integer, is_lower as cpp_is_lower, is_numeric as cpp_is_numeric, is_space as cpp_isspace, is_upper as cpp_is_upper, ) +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) +from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float from cudf._lib.strings.combine import ( concatenate as cpp_concatenate, join as cpp_join, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4e5e4ce1987..535e497e8dc 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -8,7 +8,9 @@ from pandas.core.tools.datetimes import _unit_map import cudf -from cudf._lib.strings.char_types import is_integer as cpp_is_integer +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) from cudf.core import column from cudf.core.index import as_index from cudf.utils.dtypes import is_scalar From 168c489a9415ae7bbbec5ef600b0d3dcde44b583 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 17 Mar 2021 20:37:11 -0500 Subject: [PATCH 05/26] Fix Series/Dataframe Mixed Arithmetic (#7491) Fixes https://github.com/rapidsai/cudf/issues/7385 Authors: - @brandon-b-miller Approvers: - GALI PREM SAGAR (@galipremsagar) - Michael Wang (@isVoid) URL: https://github.com/rapidsai/cudf/pull/7491 --- python/cudf/cudf/core/dataframe.py | 8 ++--- python/cudf/cudf/core/series.py | 4 +-- python/cudf/cudf/tests/test_dataframe.py | 42 ++++++++++++------------ 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 25f57748765..9672ab3002f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1518,11 +1518,7 @@ def fallback(col, fn): else: if col not in df_cols: r_opr = other_cols[col] - l_opr = Series( - column_empty( - len(self), masked=True, dtype=other.dtype - ) - ) + l_opr = Series(as_column(np.nan, length=len(self))) if col not in other_cols_keys: r_opr = None l_opr = self[col] @@ -2198,7 +2194,7 @@ def rpow(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("rpow", other, fill_value) def __rpow__(self, other): - return self._apply_op("__pow__", other) + return self._apply_op("__rpow__", other) def floordiv(self, other, axis="columns", level=None, fill_value=None): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5e7121c0488..b06fef178f6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1501,9 +1501,7 @@ def _binaryop( If ``reflect`` is ``True``, swap the order of the operands. """ if isinstance(other, cudf.DataFrame): - # TODO: fn is not the same as arg expected by _apply_op - # e.g. 
for fn = 'and', _apply_op equivalent is '__and__' - return other._apply_op(self, fn) + return NotImplemented result_name = utils.get_result_name(self, other) if isinstance(other, Series): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 77548b95277..5f4d571e8c5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4996,13 +4996,13 @@ def test_cov_nans(): @pytest.mark.parametrize( "gsr", [ - cudf.Series([1, 2, 3]), - cudf.Series([1, 2, 3], index=["a", "b", "c"]), - cudf.Series([1, 2, 3], index=["a", "b", "d"]), - cudf.Series([1, 2], index=["a", "b"]), - cudf.Series([1, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), + cudf.Series([4, 2, 3]), + cudf.Series([4, 2, 3], index=["a", "b", "c"]), + cudf.Series([4, 2, 3], index=["a", "b", "d"]), + cudf.Series([4, 2], index=["a", "b"]), + cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), pytest.param( - cudf.Series([1, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), + cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), marks=pytest.mark.xfail, ), ], @@ -5017,32 +5017,32 @@ def test_cov_nans(): operator.truediv, operator.mod, operator.pow, - # comparison ops will temporarily XFAIL - # see PR https://github.com/rapidsai/cudf/pull/7491 - pytest.param(operator.eq, marks=pytest.mark.xfail()), - pytest.param(operator.lt, marks=pytest.mark.xfail()), - pytest.param(operator.le, marks=pytest.mark.xfail()), - pytest.param(operator.gt, marks=pytest.mark.xfail()), - pytest.param(operator.ge, marks=pytest.mark.xfail()), - pytest.param(operator.ne, marks=pytest.mark.xfail()), + operator.eq, + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.ne, ], ) def test_df_sr_binop(gsr, colnames, op): - data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]] + data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] data = dict(zip(colnames, data)) + gsr = gsr.astype("float64") + gdf = cudf.DataFrame(data) - pdf = pd.DataFrame.from_dict(data) + pdf = gdf.to_pandas(nullable=True) - psr = gsr.to_pandas() + psr = gsr.to_pandas(nullable=True) expect = op(pdf, psr) - got = op(gdf, gsr) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gdf, gsr).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) expect = op(psr, pdf) - got = op(psr, pdf) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gsr, gdf).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) @pytest.mark.parametrize( From 99001d2c8d9b3898e58c74d7979ab6204c5e5bee Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 18 Mar 2021 09:57:30 +0800 Subject: [PATCH 06/26] Java support on explode_outer (#7625) This pull request aims to enable `cudf::explode_outer` and `cudf::explode_outer_position` in Java package. 
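
As a reference, a minimal sketch of the libcudf calls that the new JNI entry points wrap (illustrative only; `input` stands for any table view whose column 0 is a LIST column, and the header path is an assumption, not part of this patch):

```cpp
#include <cudf/lists/explode.hpp>  // assumed header for the explode_outer APIs
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Illustrative only: the two operations the new Java methods expose.
void explode_outer_demo(cudf::table_view const& input)
{
  // Null/empty lists are retained as rows with a null entry instead of
  // being dropped, matching Table.explodeOuter(0).
  auto exploded = cudf::explode_outer(input, 0);

  // Same, plus the intra-list position column inserted before the
  // exploded values, matching Table.explodeOuterPosition(0).
  auto exploded_pos = cudf::explode_outer_position(input, 0);
}
```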
Authors:
  - Alfred Xu (@sperlingxx)

Approvers:
  - Robert (Bobby) Evans (@revans2)

URL: https://github.com/rapidsai/cudf/pull/7625
---
 java/src/main/java/ai/rapids/cudf/Table.java | 141 +++++++++++++++---
 java/src/main/native/src/TableJni.cpp        |  28 ++++
 .../test/java/ai/rapids/cudf/TableTest.java  |  86 +++++++++--
 3 files changed, 218 insertions(+), 37 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 7385b55d0df..d0e59fdc105 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -516,6 +516,10 @@ private static native long[] repeatColumnCount(long tableHandle,

   private static native long[] explodePosition(long tableHandle, int index);

+  private static native long[] explodeOuter(long tableHandle, int index);
+
+  private static native long[] explodeOuterPosition(long tableHandle, int index);
+
   private static native long createCudfTableView(long[] nativeColumnViewHandles);

   private static native long[] columnViewsFromPacked(ByteBuffer metadata, long dataAddress);
@@ -1725,7 +1729,7 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    * Example:
    * input:  [[5,10,15], 100],
    *         [[20,25],   200],
-   *         [[30],      300],
+   *         [[30],      300]
    * index: 0
    * output: [5, 100],
    *         [10, 100],
    *         [15, 100],
@@ -1737,12 +1741,12 @@ public ContiguousTable[] contiguousSplit(int... indices) {
    *
    * Nulls propagate in different ways depending on what is null.
    *
-   * [[5,null,15], 100],
-   * [null, 200]
-   * returns:
-   * [5, 100],
-   * [null, 100],
-   * [15, 100]
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [5, 100],
+   *         [null, 100],
+   *         [15, 100]
    *
    * Note that null lists are completely removed from the output
    * and nulls inside lists are pulled out and remain.
@@ -1763,27 +1767,26 @@ public Table explode(int index) {
    * in the output. The corresponding rows for other columns in the input are duplicated. A position
    * column is added that has the index inside the original list for each row. Example:
    *
-   * [[5,10,15], 100],
-   * [[20,25], 200],
-   * [[30], 300],
-   * returns
-   * [0, 5, 100],
-   * [1, 10, 100],
-   * [2, 15, 100],
-   * [0, 20, 200],
-   * [1, 25, 200],
-   * [0, 30, 300],
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300]
+   * index: 0
+   * output: [0, 5, 100],
+   *         [1, 10, 100],
+   *         [2, 15, 100],
+   *         [0, 20, 200],
+   *         [1, 25, 200],
+   *         [0, 30, 300]
    *
    * Nulls and empty lists propagate in different ways depending on what is null or empty.
    *
-   * [[5,null,15], 100],
-   * [null, 200],
-   * [[], 300],
-   * returns
-   * [0, 5, 100],
-   * [1, null, 100],
-   * [2, 15, 100],
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [0, 5, 100],
+   *         [1, null, 100],
+   *         [2, 15, 100]
    *
    * Note that null lists are not included in the resulting table, but nulls inside
@@ -1799,6 +1802,96 @@ public Table explodePosition(int index) {
     return new Table(explodePosition(nativeHandle, index));
   }

+  /**
+   * Explodes a list column's elements.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded
+   * into new rows in the output. The corresponding rows for other columns in the input
+   * are duplicated.
+   *
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300],
+   * index: 0
+   * output: [5, 100],
+   *         [10, 100],
+   *         [15, 100],
+   *         [20, 200],
+   *         [25, 200],
+   *         [30, 300]
+   *
+   * Nulls propagate in different ways depending on what is null.
+   *
+   * input:  [[5,null,15], 100],
+   *         [null,        200]
+   * index: 0
+   * output: [5, 100],
+   *         [null, 100],
+   *         [15, 100],
+   *         [null, 200]
+   *
+   * Note that null lists are not removed from the output; they become rows
+   * with a null entry, and nulls inside lists are pulled out and remain.
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with explode_col exploded.
+   */
+  public Table explodeOuter(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuter(nativeHandle, index));
+  }
+
+  /**
+   * Explodes a list column's elements retaining any null entries or empty lists and includes a
+   * position column.
+   *
+   * Any list is exploded, which means the elements of the list in each row are expanded into new rows
+   * in the output. The corresponding rows for other columns in the input are duplicated. A position
+   * column is added that has the index inside the original list for each row.
+   *
+   * Example:
+   * input:  [[5,10,15], 100],
+   *         [[20,25],   200],
+   *         [[30],      300],
+   * index: 0
+   * output: [0, 5, 100],
+   *         [1, 10, 100],
+   *         [2, 15, 100],
+   *         [0, 20, 200],
+   *         [1, 25, 200],
+   *         [0, 30, 300]
+   *
+   * Nulls and empty lists propagate as null entries in the result.
+   *
+   * input:  [[5,null,15], 100],
+   *         [null,        200],
+   *         [[],          300]
+   * index: 0
+   * output: [0, 5, 100],
+   *         [1, null, 100],
+   *         [2, 15, 100],
+   *         [0, null, 200],
+   *         [0, null, 300]
+   *
+   * @param index Column index to explode inside the table.
+   * @return A new table with exploded value and position. The column order of the returned table is
+   *         [cols before explode_input, explode_position, explode_value, cols after explode_input].
+   */
+  public Table explodeOuterPosition(int index) {
+    assert 0 <= index && index < columns.length : "Column index is out of range";
+    assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST";
+    return new Table(explodeOuterPosition(nativeHandle, index));
+  }
+
   /**
    * Gathers the rows of this table according to `gatherMap` such that row "i"
    * in the resulting table's columns will contain row "gatherMap[i]" from this table.
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 4548156055a..02385a453d0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2052,4 +2052,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 626f7828012..c2e28e1cad8 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4635,7 +4635,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out } } - private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { + private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) { StructType nestedType = new StructType(true, new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); try (Table input = new Table.TestBuilder() @@ -4644,23 +4644,42 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { Arrays.asList(struct(4, "k4"), struct(5, "k5")), Arrays.asList(struct(6, "k6")), Arrays.asList(new HostColumnVector.StructData((List) null)), - Arrays.asList()) + null) .column("s1", "s2", "s3", "s4", "s5") .column(1, 3, 5, 7, 9) .column(12.0, 14.0, 13.0, 11.0, 15.0) .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + if (!outer) + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + else + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0); } - try (Table expected = expectedBuilder - .column(nestedType, + List expectedData = new ArrayList(){{ + if (!outer) { + this.add(new HostColumnVector.StructData[]{ + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), + new HostColumnVector.StructData((List) null)}); + this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4"}); + this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7}); + this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0}); + } else { + this.add(new HostColumnVector.StructData[]{ struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), - new HostColumnVector.StructData((List) null)) - .column("s1", "s1", "s1", "s2", "s2", "s3", "s4") - .column(1, 1, 1, 3, 3, 5, 7) - 
.column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0)
+            new HostColumnVector.StructData((List) null), null});
+        this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4", "s5"});
+        this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7, 9});
+        this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0, 15.0});
+      }
+    }};
+    try (Table expected = expectedBuilder
+        .column(nestedType, (HostColumnVector.StructData[]) expectedData.get(0))
+        .column((String[]) expectedData.get(1))
+        .column((Integer[]) expectedData.get(2))
+        .column((Double[]) expectedData.get(3))
         .build()) {
       return new Table[]{new Table(input.getColumns()), new Table(expected.getColumns())};
     }
@@ -4679,7 +4698,7 @@ void testExplode() {
     }

     // Child is nested type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false);
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, false);
    try (Table input = testTables2[0];
         Table expected = testTables2[1]) {
      try (Table exploded = input.explode(0)) {
@@ -4689,7 +4708,7 @@
    }
  }

  @Test
-  void testPosExplode() {
+  void testExplodePosition() {
    // Child is primitive type
    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, false);
    try (Table input = testTables[0];
@@ -4699,8 +4718,8 @@ void testPosExplode() {
    }
  }

-    // Child is primitive type
-    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true);
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, false);
    try (Table input = testTables2[0];
         Table expected = testTables2[1]) {
      try (Table exploded = input.explodePosition(0)) {
@@ -4709,4 +4728,45 @@ void testPosExplode() {
    }
  }

+  @Test
+  void testExplodeOuter() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(false, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuter(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
+
+  @Test
+  void testExplodeOuterPosition() {
+    // Child is primitive type
+    Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, true);
+    try (Table input = testTables[0];
+         Table expected = testTables[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, true);
+    try (Table input = testTables2[0];
+         Table expected = testTables2[1]) {
+      try (Table exploded = input.explodeOuterPosition(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+  }
 }

From 9aa33efa9e931c9de78e047a20e6a7481ee13559 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 17 Mar 2021 22:54:30 -0400
Subject: [PATCH 07/26] Optimize cudf::make_strings_column for long strings
 (#7576)

Reference #7571
This improves the performance of `cudf::make_strings_column` for long
strings. It uses a similar approach from `cudf::strings::detail::gather`
and also uses thresholding as in the optimized `cudf::strings::replace`.
This may not be the right solution for overall optimizing #7571 but may be
helpful in other places where long strings are used to create a strings
column in libcudf.
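
The dispatch itself reduces to an average-row-length check. A standalone sketch of that logic (illustrative only; the helper name is invented here, while the constant and the averaging expression mirror the change to strings_column_factories.cuh in this patch):

```cpp
#include <cudf/types.hpp>

#include <algorithm>
#include <cstddef>

// Illustrative helper (not part of the patch): condenses the new dispatch
// into a standalone predicate over the already-computed size totals.
constexpr cudf::size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64;

bool use_character_parallel(std::size_t total_bytes,
                            cudf::size_type strings_count,
                            cudf::size_type null_count)
{
  // Average byte length over non-null rows; max() guards division by zero.
  auto const avg_bytes_per_row =
    total_bytes / std::max(strings_count - null_count, 1);
  return avg_bytes_per_row >
         static_cast<std::size_t>(FACTORY_BYTES_PER_ROW_THRESHOLD);
}
```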
This PR also includes a gbenchmark to help measure the performance of this
factory function. The benchmark shows about a 10x improvement for longer
strings (more than ~64 bytes on average). I can post benchmark results here
if needed. The character-parallel algorithm was slower for shorter strings,
so the existing algorithm is used based on a threshold calculation. I also
added an additional gtest with a mixture of nulls and empty strings to make
sure the new algorithm handles these correctly.

Authors:
  - David (@davidwendt)

Approvers:
  - Jason Lowe (@jlowe)
  - Nghia Truong (@ttnghia)
  - Jake Hemstad (@jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7576
---
 cpp/benchmarks/CMakeLists.txt               |   1 +
 cpp/benchmarks/string/factory_benchmark.cu  |  93 ++++++++++++++++++
 cpp/benchmarks/string/string_bench_args.hpp |   2 +
 cpp/include/cudf/strings/detail/gather.cuh  |  87 ++++++++++------
 .../detail/strings_column_factories.cuh     |  98 +++++++++++++++----
 cpp/include/cudf/utilities/traits.hpp       |  12 +++
 cpp/tests/strings/factories_test.cu         |  34 +++++++
 7 files changed, 280 insertions(+), 47 deletions(-)
 create mode 100644 cpp/benchmarks/string/factory_benchmark.cu

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index dfc340b1459..e63ea38a31b 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -185,6 +185,7 @@ ConfigureBench(STRINGS_BENCH
   string/convert_floats_benchmark.cpp
   string/copy_benchmark.cpp
   string/extract_benchmark.cpp
+  string/factory_benchmark.cu
   string/filter_benchmark.cpp
   string/find_benchmark.cpp
   string/replace_benchmark.cpp
diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu
new file mode 100644
index 00000000000..6c5dceffaa8
--- /dev/null
+++ b/cpp/benchmarks/string/factory_benchmark.cu
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+#include
+
+namespace {
+using string_pair = thrust::pair;
+struct string_view_to_pair {
+  __device__ string_pair operator()(thrust::pair const& p)
+  {
+    return (p.second) ?
string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +class StringsFactory : public cudf::benchmark { +}; + +static void BM_factory(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::make_strings_column(pairs); + } + + cudf::strings_column_view input(table->view().column(0)); + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringsFactory, name) \ + (::benchmark::State & st) { BM_factory(st); } \ + BENCHMARK_REGISTER_F(StringsFactory, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index f81f859de74..9c709b064dd 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -17,6 +17,8 @@ #include +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. * diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28da8ef4324..988fa552100 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -31,15 +31,60 @@ #include namespace cudf { +namespace strings { +namespace detail { -template -constexpr inline bool is_signed_iterator() +/** + * @brief Returns a new chars column using the specified indices to select + * strings from the input iterator. + * + * This uses a character-parallel gather CUDA kernel that performs very + * well on a strings column with long strings (e.g. average > 64 bytes). + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param map_begin Start of index iterator. + * @param map_end End of index iterator. + * @param offsets The offset values to be associated with the output chars column. + * @param chars_bytes The total number of bytes for the output chars column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return New chars column fit for a strings column. 
+ */ +template +std::unique_ptr gather_chars(StringIterator strings_begin, + MapIterator map_begin, + MapIterator map_end, + cudf::device_span const offsets, + size_type chars_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return std::is_signed::value_type>::value; -} + auto const output_count = std::distance(map_begin, map_end); + if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); -namespace strings { -namespace detail { + auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto const d_chars = chars_column->mutable_view().template data(); + + auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { + auto const out_row = + thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); + auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index + auto const d_str = strings_begin[row_idx]; // get row's string + auto const offset = out_idx - *out_row; // get string's char + return d_str.data()[offset]; + }; + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + d_chars, + gather_chars_fn); + + return chars_column; +} /** * @brief Returns a new strings column using the specified indices to select @@ -107,29 +152,15 @@ std::unique_ptr gather( rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); // build chars column - size_type const out_chars_bytes = static_cast(total_bytes); - auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr); - auto const d_out_chars = out_chars_column->mutable_view().template data(); - - // fill in chars cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_in_chars = (strings_count > 0) ? 
strings.chars().data() : nullptr; - auto gather_chars_fn = - [d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) { - // find output row index for this output char index - auto const next_row_ptr = thrust::upper_bound( - thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx); - auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1; - auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx]; - auto const in_row_idx = begin[out_row_idx]; - auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset; - return d_in_chars[in_char_offset]; - }; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(out_chars_bytes), - d_out_chars, - gather_chars_fn); + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto out_chars_column = gather_chars(d_strings->begin(), + begin, + end, + d_out_offsets_span, + static_cast(total_bytes), + stream, + mr); return make_strings_column(output_count, std::move(out_offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e843c555c5..932f7eb0926 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include @@ -34,7 +36,27 @@ namespace cudf { namespace strings { namespace detail { -// Create a strings-type column from iterators of pointer/size pairs +/** + * @brief Average string byte-length threshold for deciding character-level + * vs. row-level parallel algorithm. + * + * This value was determined by running the factory_benchmark against different + * string lengths and observing the point where the performance is faster for + * long strings. + */ +constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; + +/** + * @brief Create a strings-type column from iterators of pointer/size pairs + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param begin First string row (inclusive) + * @param end Last string row (exclusive) + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, @@ -51,7 +73,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? 
item.second : 0; }; - size_t bytes = thrust::transform_reduce( + size_t const bytes = thrust::transform_reduce( rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < static_cast(std::numeric_limits::max()), "total size of strings is too large for cudf column"); @@ -65,26 +87,49 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); // create null mask - auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; - auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); - auto null_count = new_nulls.second; + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); + auto const null_count = new_nulls.second; auto null_mask = (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair str = thrust::get<0>(item); - size_type offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_column->view().template begin())), - strings_count, - copy_chars); + std::unique_ptr chars_column = [&] { + // use a character-parallel kernel for long string lengths + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const d_offsets = + device_span{offsets_column->view().template data(), + static_cast(offsets_column->size())}; + auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) { + return string_view{ip.first, ip.second}; + }); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + static_cast(bytes), + stream, + mr); + } else { + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + size_type const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_tuple( + begin, offsets_column->view().template begin())), + strings_count, + copy_chars); + return chars_column; + } + }(); return make_strings_column(strings_count, std::move(offsets_column), @@ -95,7 +140,22 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, mr); } -// Create a strings-type column from iterators to chars, offsets, and bitmask. +/** + * @brief Create a strings-type column from iterators to chars, offsets, and bitmask. 
+ *
+ * @tparam CharIterator iterator over character bytes (int8)
+ * @tparam OffsetIterator iterator over offset values (size_type)
+ *
+ * @param chars_begin First character byte (inclusive)
+ * @param chars_end Last character byte (exclusive)
+ * @param offset_begin First offset value (inclusive)
+ * @param offset_end Last offset value (exclusive)
+ * @param null_count Number of null rows
+ * @param null_mask The validity bitmask in Arrow format
+ * @param stream CUDA stream used for device memory operations
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
+ */
 template 
 std::unique_ptr make_strings_column(CharIterator chars_begin,
                                     CharIterator chars_end,
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index e045476ea77..1e0d45d081d 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type)
   return cudf::type_dispatcher(type, is_unsigned_impl{});
 }
 
+/**
+ * @brief Indicates whether the `Iterator` value type is signed.
+ *
+ * @tparam Iterator The type to verify
+ * @return true if the iterator's value type is signed
+ */
+template <typename Iterator>
+constexpr inline bool is_signed_iterator()
+{
+  return std::is_signed<typename std::iterator_traits<Iterator>::value_type>::value;
+}
+
 /**
  * @brief Indicates whether the type `T` is a floating point type.
  *
diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu
index f904c404251..bd463a7ab0d 100644
--- a/cpp/tests/strings/factories_test.cu
+++ b/cpp/tests/strings/factories_test.cu
@@ -19,12 +19,18 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+
+#include 
+#include 
+
 #include 
 #include 
@@ -198,3 +204,31 @@ TEST_F(StringsFactoriesTest, CreateOffsets)
     }
   }
 }
+
+namespace {
+using string_pair = thrust::pair;
+struct string_view_to_pair {
+  __device__ string_pair operator()(thrust::pair const& p)
+  {
+    return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()}
+                      : string_pair{nullptr, 0};
+  }
+};
+}  // namespace
+
+TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty)
+{
+  cudf::test::strings_column_wrapper data(
+    {"", "this", "is", "", "a", "", "column", "of", "strings", "", ""},
+    {0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1});
+
+  auto d_column = cudf::column_device_view::create(data);
+  rmm::device_vector pairs(d_column->size());
+  thrust::transform(thrust::device,
+                    d_column->pair_begin(),
+                    d_column->pair_end(),
+                    pairs.data(),
+                    string_view_to_pair{});
+
+  auto result = cudf::make_strings_column(pairs);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data);
+}

From 951b455b14a37efcbffc38638ab0b89d787d5b59 Mon Sep 17 00:00:00 2001
From: Michael Wang 
Date: Wed, 17 Mar 2021 20:29:36 -0700
Subject: [PATCH 08/26] Adds `list.take`, python binding for `cudf::lists::segmented_gather` (#7591)

Closes #7465

Implements `ListColumn.list.take` based on `cudf::lists::segmented_gather`.
Gather elements inside each list based on the provided positions.
Example:
```python
>>> s = cudf.Series([[1, 2, 3], [4, 5]])
>>> s
0    [1, 2, 3]
1       [4, 5]
dtype: list
>>> s.list.take([[2, 1], [1, 0]])
0    [3, 2]
1    [5, 4]
dtype: list
```

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7591
---
 python/cudf/cudf/_lib/copying.pyx          | 25 ++++++++-
 python/cudf/cudf/_lib/cpp/lists/gather.pxd | 13 +++++
 python/cudf/cudf/core/column/lists.py      | 61 +++++++++++++++++++++-
 python/cudf/cudf/tests/test_list.py        | 47 +++++++++++++++++
 4 files changed, 143 insertions(+), 3 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/lists/gather.pxd

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index ad798a73ed2..e5501428624 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -3,7 +3,7 @@
 import pandas as pd
 
 from libcpp cimport bool
-from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared
 from libcpp.vector cimport vector
 from libcpp.utility cimport move
 from libc.stdint cimport int32_t, int64_t
@@ -24,6 +24,10 @@
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport size_type
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+from cudf._lib.cpp.lists.gather cimport (
+    segmented_gather as cpp_segmented_gather
+)
 cimport cudf._lib.cpp.copying as cpp_copying
 
 # workaround for https://github.com/cython/cython/issues/3885
@@ -704,3 +708,22 @@ def sample(Table input, size_type n,
             else input._index_names
         )
     )
+
+
+def segmented_gather(Column source_column, Column gather_map):
+    cdef shared_ptr[lists_column_view] source_LCV = (
+        make_shared[lists_column_view](source_column.view())
+    )
+    cdef shared_ptr[lists_column_view] gather_map_LCV = (
+        make_shared[lists_column_view](gather_map.view())
+    )
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_segmented_gather(
+                source_LCV.get()[0], gather_map_LCV.get()[0])
+        )
+
+    result = Column.from_unique_ptr(move(c_result))
+    return result
diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
new file mode 100644
index 00000000000..ea664eee82e
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+
+
+cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] segmented_gather(
+        const lists_column_view source_column,
+        const lists_column_view gather_map_list
+    ) except +
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index a60fe627acb..1d3f73822a9 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -2,14 +2,16 @@
 
 import pickle
 
+import numpy as np
 import pyarrow as pa
 
 import cudf
+from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import count_elements
 from cudf.core.buffer import Buffer
-from cudf.core.column import ColumnBase, column
+from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
-from cudf.utils.dtypes import is_list_dtype
+from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype
 
 
 class ListColumn(ColumnBase):
@@ -228,3 +230,58 @@ def len(self):
         dtype: int32
         """
         return self._return_or_inplace(count_elements(self._column))
+
+    def take(self, lists_indices):
+        """
+        Collect list elements based on given indices.
+
+        Parameters
+        ----------
+        lists_indices: List type arrays
+            Specifies what to collect from each row
+
+        Returns
+        -------
+        ListColumn
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1         None
+        2       [4, 5]
+        dtype: list
+        >>> s.list.take([[0, 1], [], []])
+        0    [1, 2]
+        1      None
+        2        []
+        dtype: list
+        """
+
+        lists_indices_col = as_column(lists_indices)
+        if not isinstance(lists_indices_col, ListColumn):
+            raise ValueError("lists_indices should be list type array.")
+        if not lists_indices_col.size == self._column.size:
+            raise ValueError(
+                "lists_indices and list column is of different " "size."
+            )
+        if not is_numerical_dtype(
+            lists_indices_col.children[1].dtype
+        ) or not np.issubdtype(
+            lists_indices_col.children[1].dtype, np.integer
+        ):
+            raise TypeError(
+                "lists_indices should be column of values of index types."
+            )
+
+        try:
+            res = self._return_or_inplace(
+                segmented_gather(self._column, lists_indices_col)
+            )
+        except RuntimeError as e:
+            if "contains nulls" in str(e):
+                raise ValueError("lists_indices contains null.") from e
+            raise
+        else:
+            return res
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 195d8749ec6..33812cfa7a7 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -112,3 +112,50 @@ def test_len(data):
     got = gsr.list.len()
 
     assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("data", "idx"),
+    [
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]),
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]),
+        ([[1, 2, 3], []], [[0, 1], []]),
+        ([[1, 2, 3], [None]], [[0, 1], []]),
+        ([[1, None, 3], None], [[0, 1], []]),
+    ],
+)
+def test_take(data, idx):
+    ps = pd.Series(data)
+    gs = cudf.from_pandas(ps)
+
+    expected = pd.Series(zip(ps, idx)).map(
+        lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None
+    )
+    got = gs.list.take(idx)
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    ("invalid", "exception"),
+    [
+        ([[0]], pytest.raises(ValueError, match="different size")),
+        ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")),
+        (
+            [["a", "b"], ["c"]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        (
+            [[[1], [0]], [[0]]],
+            pytest.raises(
+                TypeError, match="should be column of values of index types"
+            ),
+        ),
+        ([[0, 1], None], pytest.raises(ValueError, match="contains null")),
+    ],
+)
+def test_take_invalid(invalid, exception):
+    gs = cudf.Series([[0, 1], [2, 3]])
+    with exception:
+        gs.list.take(invalid)

From 873955e08ae47de118b73ba9c52dfaa3062fce81 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Thu, 18 Mar 2021 09:32:06 -0400
Subject: [PATCH 09/26] Add explicit fixed_point merge test (#7635)

While confirming that libcudf does indeed support `cudf::merge` with
`fixed_point`, I ended up writing an explicit test, and there is no reason
not to just add it as a unit test.
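
For illustration only (not part of this patch), a minimal C++ sketch of the call shape being verified, assuming the cudf_test column-wrapper utilities of this era; note that a `fixed_point` column stores integer representation values plus a scale, so rep `4` at scale `-1` reads back as `0.4`:

```cpp
// Hedged sketch: mirrors the shape of the new unit test below.
#include <cudf/merge.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

std::unique_ptr<cudf::table> merge_decimals_sketch()
{
  using namespace numeric;
  using wrapper = cudf::test::fixed_point_column_wrapper<int32_t>;

  auto const lhs = wrapper{{4, 22, 33}, scale_type{-1}};  // 0.4, 2.2, 3.3
  auto const rhs = wrapper{{5, 7}, scale_type{-1}};       // 0.5, 0.7

  // Merge the two pre-sorted single-column tables on key column 0, ascending;
  // the merged decimal column holds 0.4, 0.5, 0.7, 2.2, 3.3 (all at scale -1).
  return cudf::merge({cudf::table_view{{lhs}}, cudf::table_view{{rhs}}},
                     {0},
                     {cudf::order::ASCENDING});
}
```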
Authors:
  - Conor Hoekstra (@codereport)

Approvers:
  - Karthikeyan (@karthikeyann)
  - David (@davidwendt)

URL: https://github.com/rapidsai/cudf/pull/7635
---
 cpp/tests/merge/merge_test.cpp | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp
index fa3bde8cb52..451fa82d5a3 100644
--- a/cpp/tests/merge/merge_test.cpp
+++ b/cpp/tests/merge/merge_test.cpp
@@ -729,4 +729,36 @@ TEST_F(MergeTest, KeysWithNulls)
   }
 }
 
+template 
+struct FixedPointTestBothReps : public cudf::test::BaseFixture {
+};
+
+template 
+using fp_wrapper = cudf::test::fixed_point_column_wrapper;
+
+TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes);
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointMerge)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = cudf::device_storage_type_t;
+
+  auto const a       = fp_wrapper{{4, 22, 33, 44, 55}, scale_type{-1}};
+  auto const b       = fp_wrapper{{5, 7, 10}, scale_type{-1}};
+  auto const table_a = cudf::table_view(std::vector{a});
+  auto const table_b = cudf::table_view(std::vector{b});
+  auto const tables  = std::vector{table_a, table_b};
+
+  auto const key_cols = std::vector{0};
+  auto const order    = std::vector{cudf::order::ASCENDING};
+
+  auto const exp       = fp_wrapper{{4, 5, 7, 10, 22, 33, 44, 55}, scale_type{-1}};
+  auto const exp_table = cudf::table_view(std::vector{exp});
+
+  auto const result = cudf::merge(tables, key_cols, order);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(exp_table.column(0), result->view().column(0));
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From d6cc6943411c00d01368f8a1a53567997ffaff39 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" 
Date: Thu, 18 Mar 2021 08:55:43 -0500
Subject: [PATCH 10/26] Add in JNI support for table partition (#7637)

This adds in support for `partition`, which will partition a table based on a
partition map. (The offsets contract of the underlying libcudf call is
sketched after the test changes below.)

Authors:
  - Robert (Bobby) Evans (@revans2)

Approvers:
  - Jason Lowe (@jlowe)

URL: https://github.com/rapidsai/cudf/pull/7637
---
 java/src/main/java/ai/rapids/cudf/Table.java | 21 ++++++++++++
 java/src/main/native/src/TableJni.cpp        | 33 +++++++++++++++++++
 .../test/java/ai/rapids/cudf/TableTest.java  | 17 ++++++++++
 3 files changed, 71 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index d0e59fdc105..0dc529d423f 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -183,6 +183,9 @@ public long getDeviceMemorySize() {
 
   private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices);
 
+  private static native long[] partition(long inputTable, long partitionView,
+                                         int numberOfPartitions, int[] outputOffsets);
+
   private static native long[] hashPartition(long inputTable,
                                              int[] columnsToHash,
                                              int hashTypeId,
@@ -1257,6 +1260,24 @@ public Table repeat(ColumnVector counts, boolean checkCount) {
     return new Table(repeatColumnCount(this.nativeHandle, counts.getNativeView(), checkCount));
   }
 
+  /**
+   * Partition this table using the mapping in partitionMap. partitionMap must be an integer
+   * column. The number of rows in partitionMap must be the same as this table. Each row
+   * in the map will indicate which partition the rows in the table belong to.
+   * @param partitionMap the partitions for each row.
+   * @param numberOfPartitions number of partitions
+   * @return {@link PartitionedTable} Table that exposes a limited functionality of the
+   *         {@link Table} class
+   */
+  public PartitionedTable partition(ColumnView partitionMap, int numberOfPartitions) {
+    int[] partitionOffsets = new int[numberOfPartitions];
+    return new PartitionedTable(new Table(partition(
+        getNativeView(),
+        partitionMap.getNativeView(),
+        partitionOffsets.length,
+        partitionOffsets)), partitionOffsets);
+  }
+
   /**
    * Find smallest indices in a sorted table where values should be inserted to maintain order.
    *
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 02385a453d0..81b9882104f 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1614,6 +1614,39 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env,
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jclass,
+                                                                 jlong input_table,
+                                                                 jlong partition_column,
+                                                                 jint number_of_partitions,
+                                                                 jintArray output_offsets) {
+
+  JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
+  JNI_NULL_CHECK(env, partition_column, "partition_column is null", NULL);
+  JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
+  JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions is zero", NULL);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::column_view *n_part_column = reinterpret_cast<cudf::column_view *>(partition_column);
+    cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
+
+    auto result = cudf::partition(*n_input_table,
+                                  *n_part_column,
+                                  number_of_partitions);
+
+    for (size_t i = 0; i < result.second.size() - 1; i++) {
+      // for whatever reason partition returns the length of the result at the
+      // end and hash partition/round robin do not, so skip the last entry for
+      // consistency
+      n_output_offsets[i] = result.second[i];
+    }
+
+    return cudf::jni::convert_table_for_return(env, result.first);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass,
                                                                      jlong input_table,
                                                                      jintArray columns_to_hash,
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c2e28e1cad8..c075f074068 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1773,6 +1773,23 @@ void testPartStability() {
     }
   }
 
+  @Test
+  void testPartition() {
+    try (Table t = new Table.TestBuilder()
+        .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+        .build();
+         ColumnVector parts = ColumnVector
+             .fromInts(1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+         PartitionedTable pt = t.partition(parts, 3);
+         Table expected = new Table.TestBuilder()
+             .column(1, 3, 5, 7, 9, 2, 4, 6, 8, 10)
+             .build()) {
+      int[] partCutoffs = pt.getPartitions();
+      assertArrayEquals(new int[]{0, 0, 5}, partCutoffs);
+      assertTablesAreEqual(expected, pt.getTable());
+    }
+  }
+
   @Test
   void testIdentityHashPartition() {
     final int count = 1024 * 1024;

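[Not part of the patch: a minimal C++ sketch of the libcudf contract the JNI code above relies on, assuming the `cudf::partition` API from `<cudf/partitioning.hpp>` of this era.]

```cpp
// Hedged sketch of what the new binding receives from libcudf.
#include <cudf/partitioning.hpp>
#include <cudf/table/table_view.hpp>

void partition_offsets_sketch(cudf::table_view const& input, cudf::column_view const& map)
{
  // For the test above (10 rows mapped to partitions {1, 2, 1, 2, ...} with
  // num_partitions == 3) the returned offsets are {0, 0, 5, 10}: partition 0
  // is empty, partition 1 holds rows [0, 5), partition 2 holds rows [5, 10),
  // and the trailing total row count is the entry the Java binding drops to
  // match hashPartition's output shape.
  auto [parts, offsets] = cudf::partition(input, map, 3);
}
```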
From 472305172bd373823db0976e1b0f53b3e7cc11f1 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Mar 2021 10:33:55 -0400
Subject: [PATCH 11/26] Add gbenchmarks for string substrings functions (#7603)

Reference #5698
This creates a gbenchmark for the 4 variations of the `cudf::strings::slice_strings()` API. The benchmark measures various row counts as well as string lengths.
This PR also includes changes to the `substring.cu` implementation that clean up the code and use the more efficient `make_strings_children`. This change improved performance for all 4 functions on average by 2-3x. The four variations being covered are sketched below.
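
A rough sketch of those four call shapes (illustrative only, not code from this PR; the column arguments are assumed to already exist):

```cpp
// Hedged sketch of the four slice_strings variations being benchmarked.
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/substring.hpp>

#include <string>

void slice_strings_variations(cudf::strings_column_view const& input,
                              cudf::column_view const& starts,
                              cudf::column_view const& stops,
                              cudf::strings_column_view const& delimiters)
{
  // 1. fixed start/stop character positions applied to every row
  auto v1 = cudf::strings::slice_strings(input, 1, 10);
  // 2. per-row start/stop positions taken from integer columns
  auto v2 = cudf::strings::slice_strings(input, starts, stops);
  // 3. substring bounded by the Nth occurrence of a single delimiter
  auto v3 = cudf::strings::slice_strings(input, std::string{" "}, 1);
  // 4. per-row delimiters taken from a strings column
  auto v4 = cudf::strings::slice_strings(input, delimiters, 1);
}
```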

Authors:
  - David (@davidwendt)

Approvers:
  - Nghia Truong (@ttnghia)
  - @nvdbaranec
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7603
---
 cpp/benchmarks/CMakeLists.txt                 |   1 +
 cpp/benchmarks/string/substring_benchmark.cpp |  93 ++++++++++++
 cpp/src/strings/substring.cu                  | 136 ++++++++----------
 3 files changed, 155 insertions(+), 75 deletions(-)
 create mode 100644 cpp/benchmarks/string/substring_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index e63ea38a31b..682f1ac5fca 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -190,4 +190,5 @@ ConfigureBench(STRINGS_BENCH
   string/find_benchmark.cpp
   string/replace_benchmark.cpp
   string/split_benchmark.cpp
+  string/substring_benchmark.cpp
   string/url_decode_benchmark.cpp)
diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring_benchmark.cpp
new file mode 100644
index 00000000000..d47c42e45be
--- /dev/null
+++ b/cpp/benchmarks/string/substring_benchmark.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+class StringSubstring : public cudf::benchmark {
+};
+
+enum substring_type { position, multi_position, delimiter, multi_delimiter };
+
+static void BM_substring(benchmark::State& state, substring_type rt)
+{
+  cudf::size_type const n_rows{static_cast(state.range(0))};
+  cudf::size_type const max_str_length{static_cast(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  auto starts_itr = thrust::constant_iterator(1);
+  auto stops_itr  = thrust::constant_iterator(max_str_length / 2);
+  cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows);
+  cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows);
+  auto delim_itr = thrust::constant_iterator(" ");
+  cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows);
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    switch (rt) {
+      case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break;
+      case multi_position: cudf::strings::slice_strings(input, starts, stops); break;
+      case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break;
+      case multi_delimiter:
+        cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1);
+        break;
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)                                  \
+  BENCHMARK_DEFINE_F(StringSubstring, name)                             \
+  (::benchmark::State & st) { BM_substring(st, substring_type::name); } \
+  BENCHMARK_REGISTER_F(StringSubstring, name)                           \
+    ->Apply(generate_bench_args)                                        \
+    ->UseManualTime()                                                   \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(position)
+STRINGS_BENCHMARK_DEFINE(multi_position)
+STRINGS_BENCHMARK_DEFINE(delimiter)
+STRINGS_BENCHMARK_DEFINE(multi_delimiter)
diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu
index 68080c0eb89..f712b0cb6aa 100644
--- a/cpp/src/strings/substring.cu
+++ b/cpp/src/strings/substring.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,17 +43,25 @@ namespace {
  * using the provided start, stop, and step parameters.
  */
 struct substring_fn {
-  const column_device_view d_column;
-  numeric_scalar_device_view d_start, d_stop, d_step;
-  const int32_t* d_offsets{};
+  column_device_view const d_column;
+  numeric_scalar_device_view const d_start;
+  numeric_scalar_device_view const d_stop;
+  numeric_scalar_device_view const d_step;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ cudf::size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_column.is_null(idx)) return 0;  // null string
-    string_view d_str = d_column.template element(idx);
+    if (d_column.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const d_str  = d_column.template element(idx);
     auto const length = d_str.length();
-    if (length == 0) return 0;  // empty string
+    if (length == 0) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     size_type const step = d_step.is_valid() ? d_step.value() : 1;
     auto const begin     = [&] {  // always inclusive
       // when invalid, default depends on step
@@ -88,7 +96,7 @@ struct substring_fn {
       if (d_buffer) d_buffer += from_char_utf8(*itr, d_buffer);
       itr += step;
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;
   }
 };
 
@@ -103,42 +111,26 @@ std::unique_ptr slice_strings(
   rmm::cuda_stream_view stream           = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource())
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
 
   if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0");
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_column       = *strings_column;
-  auto d_start        = get_scalar_device_view(const_cast&>(start));
-  auto d_stop         = get_scalar_device_view(const_cast&>(stop));
-  auto d_step         = get_scalar_device_view(const_cast&>(step));
-
-  // copy the null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-
-  // build offsets column
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_new_offsets = offsets_column->view().data();
-
-  // build chars column
-  auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream);
-  auto chars_column = strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     strings_count,
-                     substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars});
+  auto const d_column = column_device_view::create(strings.parent(), stream);
+  auto const d_start  = get_scalar_device_view(const_cast&>(start));
+  auto const d_stop   = get_scalar_device_view(const_cast&>(stop));
+  auto const d_step   = get_scalar_device_view(const_cast&>(step));
 
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+  auto children = make_strings_children(substring_fn{*d_column, d_start, d_stop, d_step},
+                                        strings.size(),
+                                        strings.null_count(),
+                                        stream,
+                                        mr);
+
+  return make_strings_column(strings.size(),
+                             std::move(children.first),
+                             std::move(children.second),
                              strings.null_count(),
-                             std::move(null_mask),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }
@@ -166,25 +158,33 @@ namespace {
  * This both calculates the output size and executes the substring.
  */
 struct substring_from_fn {
-  const column_device_view d_column;
-  const cudf::detail::input_indexalator starts;
-  const cudf::detail::input_indexalator stops;
-  const int32_t* d_offsets{};
+  column_device_view const d_column;
+  cudf::detail::input_indexalator const starts;
+  cudf::detail::input_indexalator const stops;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_column.is_null(idx)) return 0;  // null string
-    string_view d_str = d_column.template element(idx);
+    if (d_column.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const d_str  = d_column.template element(idx);
     auto const length = d_str.length();
     auto const start  = starts[idx];
-    if (start >= length) return 0;  // empty string
+    if (start >= length) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     auto const stop = stops[idx];
     auto const end  = (((stop < 0) || (stop > length)) ? length : stop);
 
-    string_view d_substr = d_str.substr(start, end - start);
-    if (d_chars) memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes());
-    return d_substr.size_bytes();
+    auto const d_substr = d_str.substr(start, end - start);
+    if (d_chars)
+      memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes());
+    else
+      d_offsets[idx] = d_substr.size_bytes();
   }
 };
 
@@ -212,32 +212,18 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c
   auto strings_count = d_column.size();
 
   // Copy the null mask
-  rmm::device_buffer null_mask{0, stream, mr};
-  if (d_column.nullable())
-    null_mask = rmm::device_buffer(
-      d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr);
-
-  // Build offsets column
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_new_offsets = offsets_column->view().data();
-
-  // Build chars column
-  auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream);
-  auto chars_column =
-    cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
-  auto chars_view = chars_column->mutable_view();
-  auto d_chars    = chars_view.template data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     strings_count,
-                     substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars});
+  rmm::device_buffer null_mask =
+    !d_column.nullable()
+      ? rmm::device_buffer{0, stream, mr}
+      : rmm::device_buffer(
+          d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr);
+
+  auto children = make_strings_children(
+    substring_from_fn{d_column, starts, stops}, strings_count, null_count, stream, mr);
 
   return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+                             std::move(children.first),
+                             std::move(children.second),
                              null_count,
                              std::move(null_mask),
                              stream,

From f5a421499369f810ef1611b54ec9b63f5e697160 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Mar 2021 16:28:12 -0400
Subject: [PATCH 12/26] Add gbenchmark for strings::concatenate (#7560)

Reference #5698
This creates a gbenchmark for the `cudf::strings::concatenate` function. The benchmark measures various row counts as well as string lengths. This PR also includes some changes to `combine.cu` to clean up the code and replace `device_vector` usages with `device_uvector`. A sketch of the API being measured follows.
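
Roughly (illustrative only, not code from this PR), the call being timed looks like this, assuming a table of string columns:

```cpp
// Hedged sketch of column-wise string concatenation with a separator.
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/table/table_view.hpp>

std::unique_ptr<cudf::column> concatenate_sketch(cudf::table_view const& string_columns)
{
  // Row-wise concatenation across columns: {"a","b"} + {"x","y"} with
  // separator "+" yields {"a+x","b+y"}. A null element in any column is
  // replaced by the narep scalar instead of nulling out the whole row.
  return cudf::strings::concatenate(string_columns,
                                    cudf::string_scalar("+"),
                                    cudf::string_scalar("NA"));
}
```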

Authors:
  - David (@davidwendt)

Approvers:
  - Nghia Truong (@ttnghia)
  - @nvdbaranec
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7560
---
 cpp/benchmarks/CMakeLists.txt               |  1 +
 cpp/benchmarks/string/combine_benchmark.cpp | 72 +++++++++++++++++++
 cpp/benchmarks/string/string_bench_args.hpp |  2 +
 cpp/src/strings/combine.cu                  | 80 ++++++++++-----------
 4 files changed, 115 insertions(+), 40 deletions(-)
 create mode 100644 cpp/benchmarks/string/combine_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 682f1ac5fca..0bf92ff54bb 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -180,6 +180,7 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH text/subword_benchmark.cpp)
 # - strings benchmark -------------------------------------------------------------------
 ConfigureBench(STRINGS_BENCH
   string/case_benchmark.cpp
+  string/combine_benchmark.cpp
   string/contains_benchmark.cpp
   string/convert_durations_benchmark.cpp
   string/convert_floats_benchmark.cpp
diff --git a/cpp/benchmarks/string/combine_benchmark.cpp b/cpp/benchmarks/string/combine_benchmark.cpp
new file mode 100644
index 00000000000..2a5013a9ae7
--- /dev/null
+++ b/cpp/benchmarks/string/combine_benchmark.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+class StringCombine : public cudf::benchmark {
+};
+
+static void BM_combine(benchmark::State& state)
+{
+  cudf::size_type const n_rows{static_cast(state.range(0))};
+  cudf::size_type const max_str_length{static_cast(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 2, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input1(table->view().column(0));
+  cudf::strings_column_view input2(table->view().column(1));
+  cudf::string_scalar separator("+");
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    cudf::strings::concatenate(table->view(), separator);
+  }
+
+  state.SetBytesProcessed(state.iterations() * (input1.chars_size() + input2.chars_size()));
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 4;
+  int const max_rowlen = 1 << 11;
+  int const len_mult   = 4;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)          \
+  BENCHMARK_DEFINE_F(StringCombine, name)       \
+  (::benchmark::State & st) { BM_combine(st); } \
+  BENCHMARK_REGISTER_F(StringCombine, name)     \
+    ->Apply(generate_bench_args)                \
+    ->UseManualTime()                           \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(concat)
diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp
index 9c709b064dd..05ed1bf5b33 100644
--- a/cpp/benchmarks/string/string_bench_args.hpp
+++ b/cpp/benchmarks/string/string_bench_args.hpp
@@ -19,6 +19,8 @@
 
 #include 
 
+#include 
+
 /**
  * @brief Generate row count and row length argument ranges for a string benchmark.
  *
diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu
index 4e61d4d8c41..f9b8b9e0ea3 100644
--- a/cpp/src/strings/combine.cu
+++ b/cpp/src/strings/combine.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -31,7 +32,7 @@
 #include 
 
 #include 
-#include 
+#include 
 #include 
 
 #include 
@@ -50,7 +51,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  auto num_columns = strings_columns.num_columns();
+  auto const num_columns = strings_columns.num_columns();
   CUDF_EXPECTS(num_columns > 0, "At least one column must be specified");
   // check all columns are of type string
   CUDF_EXPECTS(std::all_of(strings_columns.begin(),
@@ -59,7 +60,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
                "All columns must be of type string");
   if (num_columns == 1)  // single strings column returns a copy
     return std::make_unique(*(strings_columns.begin()), stream, mr);
-  auto strings_count = strings_columns.num_rows();
+  auto const strings_count = strings_columns.num_rows();
   if (strings_count == 0)  // empty begets empty
     return detail::make_empty_strings_column(stream, mr);
 
@@ -88,12 +89,12 @@ std::unique_ptr concatenate(table_view const& strings_columns,
   // build offsets column by computing sizes of each string in the output
   auto offsets_transformer = [d_table, d_separator, d_narep] __device__(size_type row_idx) {
     // for this row (idx), iterate over each column and add up the bytes
-    bool null_element =
+    bool const null_element =
       thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [row_idx](auto const& d_column) {
         return d_column.is_null(row_idx);
       });
     if (null_element && !d_narep.is_valid()) return 0;
-    size_type bytes = thrust::transform_reduce(
+    size_type const bytes = thrust::transform_reduce(
       thrust::seq,
       d_table.begin(),
       d_table.end(),
@@ -105,9 +106,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
       0,
       thrust::plus());
     // separator goes only in between elements
-    if (bytes > 0)                        // if not null
-      bytes -= d_separator.size_bytes();  // remove the last separator
-    return bytes;
+    return bytes == 0 ? 0 : (bytes - d_separator.size_bytes());  // remove the last separator
   };
   auto offsets_transformer_itr = thrust::make_transform_iterator(
     thrust::make_counting_iterator(0), offsets_transformer);
@@ -116,7 +115,8 @@ std::unique_ptr concatenate(table_view const& strings_columns,
   auto d_results_offsets = offsets_column->view().data();
 
   // create the chars column
-  size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count];
+  auto const bytes =
+    cudf::detail::get_value(offsets_column->view(), strings_count, stream);
   auto chars_column =
     strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
   // fill the chars column
@@ -127,18 +127,17 @@ std::unique_ptr concatenate(table_view const& strings_columns,
     strings_count,
     [d_table, num_columns, d_separator, d_narep, d_results_offsets, d_results_chars] __device__(
       size_type idx) {
-      bool null_element = thrust::any_of(
+      bool const null_element = thrust::any_of(
         thrust::seq, d_table.begin(), d_table.end(), [idx](column_device_view const& col) {
           return col.is_null(idx);
         });
       if (null_element && !d_narep.is_valid())
         return;  // do not write to buffer at all if any column element for this row is null
-      size_type offset = d_results_offsets[idx];
-      char* d_buffer   = d_results_chars + offset;
+      char* d_buffer = d_results_chars + d_results_offsets[idx];
       // write out each column's entry for this row
       for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) {
-        auto d_column = d_table.column(col_idx);
-        string_view d_str =
+        auto const d_column = d_table.column(col_idx);
+        string_view const d_str =
           d_column.is_null(idx) ? d_narep.value() : d_column.element(idx);
         d_buffer = detail::copy_string(d_buffer, d_str);
         // separator goes only in between elements
@@ -173,8 +172,8 @@ std::unique_ptr join_strings(strings_column_view const& strings,
   auto d_strings      = *strings_column;
 
   // create an offsets array for building the output memory layout
-  rmm::device_vector output_offsets(strings_count + 1);
-  auto d_output_offsets = output_offsets.data().get();
+  rmm::device_uvector output_offsets(strings_count + 1, stream);
+  auto d_output_offsets = output_offsets.data();
   // using inclusive-scan to compute last entry which is the total size
   thrust::transform_inclusive_scan(
     rmm::exec_policy(stream),
@@ -192,9 +191,12 @@ std::unique_ptr join_strings(strings_column_view const& strings,
       return bytes;
     },
     thrust::plus());
-  CUDA_TRY(cudaMemsetAsync(d_output_offsets, 0, sizeof(size_type), stream.value()));
+  size_type const zero = 0;
+  output_offsets.set_element_async(0, zero, stream);
   // total size is the last entry
-  size_type bytes = output_offsets.back();
+  // Note this call does a synchronize on the stream and thereby also protects the
+  // set_element_async parameter from going out of scope before it is used.
+  size_type const bytes = output_offsets.back_element(stream);
 
   // build offsets column (only 1 string so 2 offset entries)
   auto offsets_column =
@@ -254,7 +256,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  auto num_columns = strings_columns.num_columns();
+  auto const num_columns = strings_columns.num_columns();
   CUDF_EXPECTS(num_columns > 0, "At least one column must be specified");
   // Check if all columns are of type string
   CUDF_EXPECTS(std::all_of(strings_columns.begin(),
@@ -262,7 +264,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
                            [](auto c) { return c.type().id() == type_id::STRING; }),
                "All columns must be of type string");
 
-  auto strings_count = strings_columns.num_rows();
+  auto const strings_count = strings_columns.num_rows();
   CUDF_EXPECTS(strings_count == separators.size(),
                "Separators column should be the same size as the strings columns");
   if (strings_count == 0)  // Empty begets empty
@@ -277,7 +279,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
 
   if (num_columns == 1) {
     // Shallow copy of the resultant strings
-    rmm::device_vector out_col_strings(strings_count);
+    rmm::device_uvector out_col_strings(strings_count, stream);
 
     // Device view of the only column in the table view
     auto const col0_ptr = column_device_view::create(strings_columns.column(0), stream);
@@ -288,7 +290,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
       rmm::exec_policy(stream),
       thrust::make_counting_iterator(0),
       thrust::make_counting_iterator(strings_count),
-      out_col_strings.data().get(),
+      out_col_strings.begin(),
       // Output depends on the separator
       [col0, invalid_str, separator_col_view, separator_rep, col_rep] __device__(auto ridx) {
         if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return invalid_str;
@@ -334,7 +336,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
     if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return 0;
 
     // For this row (idx), iterate over each column and add up the bytes
-    bool all_nulls =
+    bool const all_nulls =
       thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& d_column) {
         return d_column.is_null(ridx);
       });
@@ -343,11 +345,11 @@ std::unique_ptr concatenate(table_view const& strings_columns,
     if (all_nulls && !col_rep.is_valid()) return 0;
 
     // There is at least one non-null column value (it can still be empty though)
-    auto separator_str = separator_col_view.is_valid(ridx)
-                           ? separator_col_view.element(ridx)
-                           : separator_rep.value();
+    auto const separator_str = separator_col_view.is_valid(ridx)
+                                 ? separator_col_view.element(ridx)
+                                 : separator_rep.value();
 
-    size_type bytes = thrust::transform_reduce(
+    size_type const bytes = thrust::transform_reduce(
       thrust::seq,
       d_table.begin(),
       d_table.end(),
@@ -395,7 +397,7 @@ std::unique_ptr concatenate(table_view const& strings_columns,
                        // to replace, do not write anything for this row
                        if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return;
 
-                       bool all_nulls = thrust::all_of(
+                       bool const all_nulls = thrust::all_of(
                          thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) {
                            return col.is_null(ridx);
                          });
@@ -404,29 +406,27 @@ std::unique_ptr concatenate(table_view const& strings_columns,
                        // skip this row
                        if (all_nulls && !col_rep.is_valid()) return;
 
-                       size_type offset    = d_results_offsets[ridx];
-                       char* d_buffer      = d_results_chars + offset;
+                       char* d_buffer      = d_results_chars + d_results_offsets[ridx];
                        bool colval_written = false;
 
                        // There is at least one non-null column value (it can still be empty though)
-                       auto separator_str = separator_col_view.is_valid(ridx)
-                                              ? separator_col_view.element(ridx)
-                                              : separator_rep.value();
+                       auto const separator_str = separator_col_view.is_valid(ridx)
+                                                    ? separator_col_view.element(ridx)
+                                                    : separator_rep.value();
 
                        // Write out each column's entry for this row
                        for (size_type col_idx = 0; col_idx < num_columns; ++col_idx) {
-                         auto d_column = d_table.column(col_idx);
-                         // If the column isn't valid and if there isn't a replacement for it, skip
-                         // it
+                         auto const d_column = d_table.column(col_idx);
+                         // If the row is null and if there is no replacement, skip it
                          if (d_column.is_null(ridx) && !col_rep.is_valid()) continue;
 
                          // Separator goes only in between elements
                          if (colval_written)
                            d_buffer = detail::copy_string(d_buffer, separator_str);
 
-                         string_view d_str = d_column.is_null(ridx)
-                                               ? col_rep.value()
-                                               : d_column.element(ridx);
+                         string_view const d_str = d_column.is_null(ridx)
+                                                     ? col_rep.value()
+                                                     : d_column.element(ridx);
                          d_buffer       = detail::copy_string(d_buffer, d_str);
                          colval_written = true;
                        }

From ec5364c2fc0a3c63583e648ec90efa8d3b5675bc Mon Sep 17 00:00:00 2001
From: Michael Wang 
Date: Thu, 18 Mar 2021 14:38:53 -0700
Subject: [PATCH 13/26] Adds `explode` API (#7607)

Closes #2975

This PR introduces the `explode` API, which flattens list columns and turns list elements into rows. Example:

```python
>>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]])
>>> s
0    [1, 2, 3]
1           []
2         None
3       [4, 5]
dtype: list
>>> s.explode()
0       1
0       2
0       3
1    <NA>
2    <NA>
3       4
3       5
dtype: int64
```

Supersedes #7538

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Keith Kraus (@kkraus14)
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7607
---
 python/cudf/cudf/_lib/cpp/lists/explode.pxd | 13 +++++
 python/cudf/cudf/_lib/lists.pyx             | 28 ++++++++++-
 python/cudf/cudf/core/dataframe.py          | 46 ++++++++++++++++++
 python/cudf/cudf/core/frame.py              | 22 +++++++++
 python/cudf/cudf/core/series.py             | 41 ++++++++++++++++
 python/cudf/cudf/tests/test_dataframe.py    | 53 +++++++++++++++++++++
 python/cudf/cudf/tests/test_series.py       | 29 +++++++++++
 7 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/lists/explode.pxd

diff --git a/python/cudf/cudf/_lib/cpp/lists/explode.pxd b/python/cudf/cudf/_lib/cpp/lists/explode.pxd
new file mode 100644
index 00000000000..cd2d44d2e42
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/explode.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.table.table cimport table
+from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
+
+cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
+    cdef unique_ptr[table] explode_outer(
+        const table_view,
+        size_type explode_column_idx,
+    ) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index aba13580912..0f0ee35556a 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -1,17 +1,25 @@
 # Copyright (c) 2021, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr, shared_ptr, make_shared
 from libcpp.utility cimport move
 
 from cudf._lib.cpp.lists.count_elements cimport (
     count_elements as cpp_count_elements
 )
+from cudf._lib.cpp.lists.explode cimport (
+    explode_outer as cpp_explode_outer
+)
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.column.column cimport column
 
-from cudf._lib.column cimport Column
+from cudf._lib.cpp.table.table cimport table
+from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
 
+from cudf._lib.column cimport Column
+from cudf._lib.table cimport Table
 
 from cudf.core.dtypes import ListDtype
 
@@ -32,3 +40,21 @@ def count_elements(Column col):
 
     result = Column.from_unique_ptr(move(c_result))
     return result
+
+
+def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
+    cdef table_view c_table_view = (
+        tbl.data_view() if ignore_index else tbl.view()
+    )
+    cdef size_type c_explode_column_idx = explode_column_idx
+
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx))
+
+    return Table.from_unique_ptr(
+        move(c_result),
+        column_names=tbl._column_names,
+        index_names=None if ignore_index else tbl._index_names
+    )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 9672ab3002f..4fb5762e098 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7705,6 +7705,52 @@ def equals(self, other):
                 return False
         return super().equals(other)
 
+    def explode(self, column, ignore_index=False):
+        """
+        Transform each element of a list-like to a row, replicating index
+        values.
+
+        Parameters
+        ----------
+        column : str or tuple
+            Column to explode.
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame(
+        ...     {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]})
+        >>> df
+                   a   b
+        0  [1, 2, 3]  11
+        1         []  22
+        2       None  33
+        3     [4, 5]  44
+        >>> df.explode('a')
+              a   b
+        0     1  11
+        0     2  11
+        0     3  11
+        1  <NA>  22
+        2  <NA>  33
+        3     4  44
+        3     5  44
+        """
+        if column not in self._column_names:
+            raise KeyError(column)
+
+        if not is_list_dtype(self._data[column].dtype):
+            data = self._data.copy(deep=True)
+            idx = None if ignore_index else self._index.copy(deep=True)
+            return self.__class__._from_data(data, index=idx)
+
+        return super()._explode(column, ignore_index)
+
     _accessors = set()  # type: Set[Any]
 
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index fab5936f94d..bfcc2d125db 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -573,6 +573,28 @@ def equals(self, other, **kwargs):
         else:
             return self._index.equals(other._index)
 
+    def _explode(self, explode_column: Any, ignore_index: bool):
+        """Helper function for `explode` in `Series` and `Dataframe`, explodes
+        a specified nested column. Other columns' corresponding rows are
+        duplicated. If ignore_index is set, the original index is not exploded
+        and will be replaced with a `RangeIndex`.
+        """
+        explode_column_num = self._column_names.index(explode_column)
+        if not ignore_index and self._index is not None:
+            explode_column_num += self._index.nlevels
+
+        res_tbl = libcudf.lists.explode_outer(
+            self, explode_column_num, ignore_index
+        )
+        res = self.__class__._from_table(res_tbl)
+
+        res._data.multiindex = self._data.multiindex
+        res._data._level_names = self._data._level_names
+
+        if not ignore_index and self._index is not None:
+            res.index.names = self._index.names
+        return res
+
     def _get_columns_by_label(self, labels, downcast):
         """
         Returns columns of the Frame specified by `labels`
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index b06fef178f6..7ed2157277c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -6362,6 +6362,47 @@ def keys(self):
         """
         return self.index
 
+    def explode(self, ignore_index=False):
+        """
+        Transform each element of a list-like to a row, replicating index
+        values.
+
+        Parameters
+        ----------
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1           []
+        2         None
+        3       [4, 5]
+        dtype: list
+        >>> s.explode()
+        0       1
+        0       2
+        0       3
+        1    <NA>
+        2    <NA>
+        3       4
+        3       5
+        dtype: int64
+        """
+        if not is_list_dtype(self._column.dtype):
+            data = self._data.copy(deep=True)
+            idx = None if ignore_index else self._index.copy(deep=True)
+            return self.__class__._from_data(data, index=idx)
+
+        return super()._explode(self._column_names[0], ignore_index)
+
     _accessors = set()  # type: Set[Any]
 
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 5f4d571e8c5..b3ba439cb15 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8442,3 +8442,56 @@ def test_rename_for_level_is_None_MC():
     got = gdf.rename(columns={"a": "f"}, level=None)
 
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [
+            [[1, 2, 3], 11, "a"],
+            [None, 22, "e"],
+            [[4], 33, "i"],
+            [[], 44, "o"],
+            [[5, 6], 55, "u"],
+        ],  # nested
+        [
+            [1, 11, "a"],
+            [2, 22, "e"],
+            [3, 33, "i"],
+            [4, 44, "o"],
+            [5, 55, "u"],
+        ],  # non-nested
+    ],
+)
+@pytest.mark.parametrize(
+    ("labels", "label_to_explode"),
+    [
+        (None, 0),
+        (pd.Index(["a", "b", "c"]), "a"),
+        (
+            pd.MultiIndex.from_tuples(
+                [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
+            ),
+            (0, "a"),
+        ),
+    ],
+)
+@pytest.mark.parametrize("ignore_index", [True, False])
+@pytest.mark.parametrize(
+    "p_index",
+    [
+        None,
+        ["ia", "ib", "ic", "id", "ie"],
+        pd.MultiIndex.from_tuples(
+            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
+        ),
+    ],
+)
+def test_explode(data, labels, ignore_index, p_index, label_to_explode):
+    pdf = pd.DataFrame(data, index=p_index, columns=labels)
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.explode(label_to_explode, ignore_index)
+    got = gdf.explode(label_to_explode, ignore_index)
+
+    assert_eq(expect, got, check_dtype=False)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index a1b4236719d..beda14934ca 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1118,3 +1118,32 @@ def test_series_drop_raises():
     actual = gs.drop("p", errors="ignore")
 
     assert_eq(actual, expect)
+
+
+@pytest.mark.parametrize(
+    "data", [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]],
+)
+@pytest.mark.parametrize("ignore_index", [True, False])
+@pytest.mark.parametrize(
+    "p_index",
+    [
+        None,
+        ["ia", "ib", "ic", "id", "ie"],
+        pd.MultiIndex.from_tuples(
+            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
+        ),
+    ],
+)
+def test_explode(data, ignore_index, p_index):
+    pdf = pd.Series(data, index=p_index, name="someseries")
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.explode(ignore_index)
+    got = gdf.explode(ignore_index)
+
+    if data == [1, 2, 3, 4, 5] and ignore_index and p_index is not None:
+        # https://github.com/pandas-dev/pandas/issues/40487
+        with pytest.raises(AssertionError, match="different"):
+            assert_eq(expect, got, check_dtype=False)
+    else:
+        assert_eq(expect, got, check_dtype=False)
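
For illustration, a quick sketch of the new API on the DataFrame side, assuming a cudf build that includes this change (the commented output is approximate):

```python
import cudf

gdf = cudf.DataFrame({"a": [[1, 2], [], None, [3]], "b": ["x", "y", "z", "w"]})

print(gdf.explode("a"))
#       a  b
# 0     1  x
# 0     2  x
# 1  <NA>  y
# 2  <NA>  z
# 3     3  w

# ignore_index=True replaces the duplicated index with a RangeIndex
print(gdf.explode("a", ignore_index=True))
```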

From a5684328726bfa8dfdbbf10a50f47677ce57974b Mon Sep 17 00:00:00 2001
From: Jason Lowe 
Date: Thu, 18 Mar 2021 17:50:50 -0500
Subject: [PATCH 14/26] Fix internal compiler error during JNI Docker build
 (#7645)

This updates the JNI Docker build to devtoolset-8 to work around an internal compiler error in gcc 7.3.1. The JNI Docker build is also updated to use the dlpack and RMM sources that were already pulled down by CPM during the libcudf build, rather than pulling and building them separately.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Raza Jafri (@razajafri)
  - Mike Wilson (@hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/7645
---
 java/ci/Dockerfile.centos7          |  6 +++---
 java/ci/README.md                   |  4 ++--
 java/ci/build-in-docker.sh          | 26 +-------------------------
 java/src/main/native/CMakeLists.txt |  6 ++++--
 4 files changed, 10 insertions(+), 32 deletions(-)

diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7
index 7e0e1c70d72..cbf7e22b229 100644
--- a/java/ci/Dockerfile.centos7
+++ b/java/ci/Dockerfile.centos7
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ FROM nvidia/cuda:$CUDA_VERSION-devel-centos7
 
 ### Install basic requirements
 RUN yum install -y centos-release-scl
-RUN yum install -y devtoolset-7 epel-release
+RUN yum install -y devtoolset-8 epel-release
 RUN yum install -y git zlib-devel maven tar wget patch
 
 ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins
@@ -34,7 +34,7 @@ RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && c
 RUN cd /rapids/ && wget https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.gz && \
    tar zxf boost_1_72_0.tar.gz && \
    cd boost_1_72_0 && \
-   scl enable devtoolset-7 "./bootstrap.sh --prefix=/usr && ./b2 install --with-filesystem threading=multi link=static cxxflags=-fPIC; exit 0"
+   scl enable devtoolset-8 "./bootstrap.sh --prefix=/usr && ./b2 install --with-filesystem threading=multi link=static cxxflags=-fPIC; exit 0"
 
 RUN cd /usr/local/ && wget --quiet https://github.com/Kitware/CMake/releases/download/v3.19.0/cmake-3.19.0-Linux-x86_64.tar.gz && \
    tar zxf cmake-3.19.0-Linux-x86_64.tar.gz
diff --git a/java/ci/README.md b/java/ci/README.md
index 865ed7fd083..3ffed71b27c 100644
--- a/java/ci/README.md
+++ b/java/ci/README.md
@@ -39,12 +39,12 @@ Here I choose to download again in the container.
 git clone --recursive https://github.com/rapidsai/cudf.git -b branch-0.19
 ```
 
-### Build cuDF jar
+### Build cuDF jar with devtoolset
 
 ```bash
 cd cudf
 export WORKSPACE=`pwd`
-scl enable devtoolset-7 "java/ci/build-in-docker.sh"
+scl enable devtoolset-8 "java/ci/build-in-docker.sh"
 ```
 
 ### The output
diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh
index 7e51a150ebc..eee943cde38 100755
--- a/java/ci/build-in-docker.sh
+++ b/java/ci/build-in-docker.sh
@@ -31,12 +31,7 @@ SIGN_FILE=$1
 OUT_PATH=$WORKSPACE/$OUT
 
 # set on Jenkins parameter
-if [ -z $RMM_VERSION ]
-then
-RMM_VERSION=`git describe --tags | grep -o -E '([0-9]+\.[0-9]+)'`
-fi
-echo "RMM_VERSION: $RMM_VERSION,\
- SIGN_FILE: $SIGN_FILE,\
+echo "SIGN_FILE: $SIGN_FILE,\
  SKIP_JAVA_TESTS: $SKIP_JAVA_TESTS,\
  BUILD_CPP_TESTS: $BUILD_CPP_TESTS,\
  ENABLED_PTDS: $ENABLE_PTDS,\
@@ -47,30 +42,11 @@ INSTALL_PREFIX=/usr/local/rapids
 export GIT_COMMITTER_NAME="ci"
 export GIT_COMMITTER_EMAIL="ci@nvidia.com"
 export CUDACXX=/usr/local/cuda/bin/nvcc
-export RMM_ROOT=$INSTALL_PREFIX
-export DLPACK_ROOT=$INSTALL_PREFIX
 export LIBCUDF_KERNEL_CACHE_PATH=/rapids
 
 # add cmake 3.19 to PATH
 export PATH=/usr/local/cmake-3.19.0-Linux-x86_64/bin:$PATH
 
-cd /rapids/
-git clone --recurse-submodules https://github.com/rapidsai/rmm.git -b branch-$RMM_VERSION
-git clone --recurse-submodules https://github.com/rapidsai/dlpack.git -b cudf
-
-###### Build rmm/dlpack ######
-mkdir -p /rapids/rmm/build
-cd /rapids/rmm/build
-echo "RMM SHA: `git rev-parse HEAD`"
-cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DBUILD_TESTS=$BUILD_CPP_TESTS
-make -j$PARALLEL_LEVEL install
-
-mkdir -p /rapids/dlpack/build
-cd /rapids/dlpack/build
-echo "DLPACK SHA: `git rev-parse HEAD`"
-cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DBUILD_TESTS=$BUILD_CPP_TESTS
-make -j$PARALLEL_LEVEL install
-
 ###### Build libcudf ######
 rm -rf $WORKSPACE/cpp/build
 mkdir -p $WORKSPACE/cpp/build
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index ceafc75f840..46b3f0c5a53 100755
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -125,7 +125,8 @@ find_path(LIBCUDACXX_INCLUDE "cuda"
           "${CUDF_CPP_BUILD_DIR}/_deps/libcudacxx-src/include")
 
 find_path(SPDLOG_INCLUDE "spdlog"
-    HINTS "$ENV{RMM_ROOT}/_deps/spdlog-src/include"
+    HINTS "${CUDF_CPP_BUILD_DIR}/_deps/spdlog-src/include"
+          "$ENV{RMM_ROOT}/_deps/spdlog-src/include"
           "$ENV{RMM_ROOT}/include"
           "$ENV{CONDA_PREFIX}/include")
 
@@ -147,7 +148,8 @@ find_library(CUDF_LIB "cudf" REQUIRED HINTS ${CUDF_LIB_HINTS})
 # - RMM -------------------------------------------------------------------------------------------
 
 find_path(RMM_INCLUDE "rmm"
-          HINTS "$ENV{RMM_ROOT}/include"
+          HINTS "${CUDF_CPP_BUILD_DIR}/_deps/rmm-src/include"
+                "$ENV{RMM_ROOT}/include"
                 "$ENV{RMM_HOME}/include"
                 "$ENV{CONDA_PREFIX}/include/rmm"
                 "$ENV{CONDA_PREFIX}/include")

From c13351d0d323e79c77e4ad77e739d3f7d12ad140 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Thu, 18 Mar 2021 20:36:04 -0400
Subject: [PATCH 15/26] Handle constructing a `cudf.Scalar` from a
 `cudf.Scalar` (#7639)

...also fix `DeviceScalar.__repr__` to print `"DeviceScalar"` instead of `"Scalar"`.
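
A small sketch of the resulting behavior, assuming a cudf build that includes this change (values are illustrative):

```python
import numpy as np
import cudf

x = cudf.Scalar(np.int64(1))
y = cudf.Scalar(x)  # constructing a Scalar from a Scalar now shares its state

assert x.value == y.value
print(repr(y))               # Scalar(1, dtype=int64)
print(repr(y.device_value))  # now prints DeviceScalar(...), not Scalar(...)
```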

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)
  - @brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/7639
---
 python/cudf/cudf/_lib/scalar.pyx      | 10 +++++++---
 python/cudf/cudf/core/scalar.py       | 14 ++++++++++++--
 python/cudf/cudf/tests/test_scalar.py | 14 ++++++++++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 6a78be000c9..a5945bc72f0 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -47,7 +47,7 @@ cdef class DeviceScalar:
 
     def __init__(self, value, dtype):
         """
-        cudf.Scalar: Type representing a scalar value on the device
+        Type representing an *immutable* scalar value on the device
 
         Parameters
         ----------
@@ -63,6 +63,7 @@ cdef class DeviceScalar:
         self._set_value(value, dtype)
 
     def _set_value(self, value, dtype):
+        # IMPORTANT: this should only ever be called from __init__
         valid = not _is_null_host_scalar(value)
 
         if pd.api.types.is_string_dtype(dtype):
@@ -128,9 +129,12 @@ cdef class DeviceScalar:
 
     def __repr__(self):
         if self.value is cudf.NA:
-            return f"Scalar({self.value}, {self.dtype.__repr__()})"
+            return (
+                f"{self.__class__.__name__}"
+                f"({self.value}, {self.dtype.__repr__()})"
+            )
         else:
-            return f"Scalar({self.value.__repr__()})"
+            return f"{self.__class__.__name__}({self.value.__repr__()})"
 
     @staticmethod
     cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr):
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index a3467e6fbe0..1e998ae37e2 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -56,7 +56,14 @@ def __init__(self, value, dtype=None):
         self._host_value = None
         self._host_dtype = None
         self._device_value = None
-        if isinstance(value, DeviceScalar):
+
+        if isinstance(value, Scalar):
+            if value._is_host_value_current:
+                self._host_value = value._host_value
+                self._host_dtype = value._host_dtype
+            else:
+                self._device_value = value._device_value
+        elif isinstance(value, DeviceScalar):
             self._device_value = value
         else:
             self._host_value, self._host_dtype = self._preprocess_host_value(
@@ -248,7 +255,10 @@ def __neg__(self):
     def __repr__(self):
         # str() fixes a numpy bug with NaT
         # https://github.com/numpy/numpy/issues/17552
-        return f"Scalar({str(self.value)}, dtype={self.dtype})"
+        return (
+            f"{self.__class__.__name__}"
+            f"({str(self.value)}, dtype={self.dtype})"
+        )
 
     def _binop_result_dtype_or_error(self, other, op):
         if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}:
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 003e46c7e0d..58115cecee7 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -289,3 +289,17 @@ def test_device_scalar_direct_construction(value):
         assert s.dtype == "object"
     else:
         assert s.dtype == dtype
+
+
+@pytest.mark.parametrize("value", SCALAR_VALUES)
+def test_construct_from_scalar(value):
+    value = cudf.utils.utils.to_cudf_compatible_scalar(value)
+    x = cudf.Scalar(value, value.dtype)
+    y = cudf.Scalar(x)
+    assert x.value == y.value or np.isnan(x.value) and np.isnan(y.value)
+
+    # check that this works:
+    y.device_value
+
+    assert x._is_host_value_current == y._is_host_value_current
+    assert x._is_device_value_current == y._is_device_value_current

From 6fe8b1b7be71c0f3587d3ef43c535aec376d1e45 Mon Sep 17 00:00:00 2001
From: Mark Harris 
Date: Fri, 19 Mar 2021 21:03:56 +1100
Subject: [PATCH 16/26] Eliminate literal parameters to
 uvector::set_element_async and device_scalar::set_value (#7563)

After rapidsai/rmm#725 is merged, this PR updates cudf to eliminate passing literal values to `device_scalar::set_value`.

Note there are still similar issues with uses of `cudf::scalar` that need to be addressed, but I wanted to get this open before the end of the week. This can be merged once rapidsai/rmm#725 is in (fixing the build), so I think the `scalar` issues should be fixed separately.
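
For reference, the before/after pattern looks roughly like this; a sketch, assuming an RMM build that includes rapidsai/rmm#725 (names are illustrative):

```c++
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

void count_example(rmm::cuda_stream_view stream)
{
  rmm::device_scalar<int> write_index(0, stream);

  // Before: write_index.set_value(0, stream);  // literal temporaries no longer bind
  write_index.set_value_zero(stream);  // dedicated zero-setter

  // Non-zero values are bound to a named host variable that outlives the async copy
  auto const init{4};
  write_index.set_value(init, stream);
}
```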

Authors:
  - Mark Harris (@harrism)

Approvers:
  - Devavret Makkar (@devavret)
  - Paul Taylor (@trxcllnt)
  - Mike Wilson (@hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/7563
---
 cpp/src/io/json/reader_impl.cu                | 2 +-
 cpp/src/join/hash_join.cu                     | 2 +-
 cpp/src/join/hash_join.cuh                    | 2 +-
 cpp/src/join/nested_loop_join.cuh             | 6 +++---
 cpp/tests/jit/jit-cache-multiprocess-test.cpp | 6 ++++--
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 5a82c9891b8..1a1fa8d0602 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -158,7 +158,7 @@ std::unique_ptr<table> create_json_keys_info_table(const parse_options_view &opt
   auto const info_table_mdv = mutable_table_device_view::create(info_table->mutable_view(), stream);
 
   // Reset the key counter - now used for indexing
-  key_counter.set_value(0, stream);
+  key_counter.set_value_zero(stream);
   // Fill the allocated columns
   cudf::io::json::gpu::collect_keys_info(
     options, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream);
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index c2c32d4165a..b64e91c18bd 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -287,7 +287,7 @@ std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_jo
 
     constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE};
     detail::grid_1d config(probe_table.num_rows(), block_size);
-    write_index.set_value(0, stream);
+    write_index.set_value_zero(stream);
 
     row_hash hash_probe{probe_table};
     row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL};
diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh
index 712d771bd73..b37f228f6d3 100644
--- a/cpp/src/join/hash_join.cuh
+++ b/cpp/src/join/hash_join.cuh
@@ -120,7 +120,7 @@ size_type estimate_join_output_size(table_device_view build_table,
   do {
     sample_probe_num_rows = std::min(sample_probe_num_rows, probe_table_num_rows);
 
-    size_estimate.set_value(0, stream);
+    size_estimate.set_value_zero(stream);
 
     row_hash hash_probe{probe_table};
     row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL};
diff --git a/cpp/src/join/nested_loop_join.cuh b/cpp/src/join/nested_loop_join.cuh
index 03d684f91d4..580017a6704 100644
--- a/cpp/src/join/nested_loop_join.cuh
+++ b/cpp/src/join/nested_loop_join.cuh
@@ -89,7 +89,7 @@ size_type estimate_nested_loop_join_output_size(table_device_view left,
   int num_sms{-1};
   CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
 
-  size_estimate.set_value(0, stream);
+  size_estimate.set_value_zero(stream);
 
   row_equality equality{left, right, compare_nulls == null_equality::EQUAL};
   // Determine number of output rows without actually building the output to simply
@@ -163,7 +163,7 @@ get_base_nested_loop_join_indices(table_view const& left,
 
     constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE};
     detail::grid_1d config(left_table->num_rows(), block_size);
-    write_index.set_value(0);
+    write_index.set_value_zero(stream);
 
     row_equality equality{*left_table, *right_table, compare_nulls == null_equality::EQUAL};
     const auto& join_output_l =
@@ -182,7 +182,7 @@ get_base_nested_loop_join_indices(table_view const& left,
 
     CHECK_CUDA(stream.value());
 
-    join_size              = write_index.value();
+    join_size              = write_index.value(stream);
     current_estimated_size = estimated_size;
     estimated_size *= 2;
   } while ((current_estimated_size < join_size));
diff --git a/cpp/tests/jit/jit-cache-multiprocess-test.cpp b/cpp/tests/jit/jit-cache-multiprocess-test.cpp
index 3dbf5b59c88..2f0b353673e 100644
--- a/cpp/tests/jit/jit-cache-multiprocess-test.cpp
+++ b/cpp/tests/jit/jit-cache-multiprocess-test.cpp
@@ -49,8 +49,10 @@ TEST_F(JitCacheMultiProcessTest, MultiProcessTest)
     // Brand new cache object that has nothing in in-memory cache
     cudf::jit::cudfJitCache cache;
 
-    input->set_value(4);
-    output->set_value(1);
+    auto const in{4};
+    auto const out{1};
+    input->set_value(in);
+    output->set_value(out);
 
     // make program
     auto program = cache.getProgram("FileCacheTestProg3", program3_source);

From bd29a922c44e52286580053e5d68f831c7c4404a Mon Sep 17 00:00:00 2001
From: Devavret Makkar 
Date: Fri, 19 Mar 2021 19:58:45 +0530
Subject: [PATCH 17/26] Add struct support to parquet writer (#7461)

### Adds struct writing ability to parquet writer.
The internals of the writer have been changed in the following way:
- Previously we would construct `parquet_column_view` from the cudf columns and the input options and use it to construct the schema. Now we construct the schema directly from the input cudf columns and the input options.
- The constructed schema is used to generate views of cudf columns which have a single-child hierarchy, e.g. one `struct<int, float>` column is converted into two columns: `struct<int>`, `struct<float>`. Each of these columns results in a separate `parquet_column_view`, which is used only for encoding.
- To give the user finer control over per-column options, the old metadata class is replaced by `table_input_metadata`.

#### Breaking change: Input metadata
The new input metadata class `table_input_metadata` contains a vector of `column_in_metadata`, one per column of the table; each `column_in_metadata` in turn contains a vector of `column_in_metadata`, one for each child of the corresponding input column. It can be constructed from the input table, after which specific options can be changed at each level.

For a table with a single struct column
```
Struct<is_human:bool,
       Struct<weight:float,
              age:int
             > (nullable)
      > (non-nullable)
```
We can set the per level names and optional nullability as follows:
```c++
cudf::io::table_input_metadata metadata(table);
metadata.column_metadata[0].set_name("being").set_nullability(false);
metadata.column_metadata[0].child(0).set_name("human?").set_nullability(false);
metadata.column_metadata[0].child(1).set_name("particulars");
metadata.column_metadata[0].child(1).child(0).set_name("weight");
metadata.column_metadata[0].child(1).child(1).set_name("age");
```
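The metadata can then be handed to the writer through the options builder. A sketch using the API added in this PR; the output path and the `table`/`metadata` parameters are placeholders:
```c++
#include <cudf/io/parquet.hpp>

// `table` is a cudf::table_view and `metadata` the table_input_metadata built above
void write_with_metadata(cudf::table_view const& table,
                         cudf::io::table_input_metadata const& metadata)
{
  auto options = cudf::io::parquet_writer_options::builder(
                   cudf::io::sink_info{"out.parquet"}, table)
                   .metadata(&metadata)
                   .build();
  cudf::io::write_parquet(options);
}
```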
#### Related issues
Closes #6989
Closes #6816
Strangely, there isn't an issue asking for struct writing support.

Authors:
  - Devavret Makkar (@devavret)
  - Kumar Aatish (@kaatish)

Approvers:
  - Vukasin Milovanovic (@vuule)
  - @nvdbaranec
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7461
---
 cpp/include/cudf/io/parquet.hpp          |  233 +++--
 cpp/src/io/functions.cpp                 |   16 +
 cpp/src/io/parquet/page_dict.cu          |    2 +-
 cpp/src/io/parquet/page_enc.cu           |  382 ++++---
 cpp/src/io/parquet/parquet_gpu.hpp       |   38 +-
 cpp/src/io/parquet/writer_impl.cu        | 1219 ++++++++++++----------
 cpp/src/io/parquet/writer_impl.hpp       |   18 +-
 cpp/src/io/utilities/column_utils.cuh    |   39 +-
 cpp/tests/io/parquet_test.cpp            |  729 ++++++++++---
 python/cudf/cudf/_lib/cpp/io/parquet.pxd |   35 +-
 python/cudf/cudf/_lib/parquet.pyx        |   67 +-
 python/cudf/cudf/_lib/utils.pyx          |    3 +-
 python/cudf/cudf/core/dataframe.py       |    9 -
 python/cudf/cudf/tests/test_parquet.py   |   86 ++
 14 files changed, 1910 insertions(+), 966 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index cd3b7bf27da..3e63e8fc770 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -376,6 +376,162 @@ table_with_metadata read_parquet(
  * @{
  * @file
  */
+class table_input_metadata;
+
+class column_in_metadata {
+  friend table_input_metadata;
+  std::string _name = "";
+  thrust::optional<bool> _nullable;
+  // TODO: This isn't implemented yet
+  bool _list_column_is_map  = false;
+  bool _use_int96_timestamp = false;
+  // bool _output_as_binary = false;
+  thrust::optional<uint8_t> _decimal_precision;
+  std::vector<column_in_metadata> children;
+
+ public:
+  /**
+   * @brief Set the name of this column
+   *
+   * @return this for chaining
+   */
+  column_in_metadata& set_name(std::string const& name)
+  {
+    _name = name;
+    return *this;
+  }
+
+  /**
+   * @brief Set the nullability of this column
+   *
+   * Only valid in case of chunked writes. In single writes, this option is ignored.
+   *
+   * @return column_in_metadata&
+   */
+  column_in_metadata& set_nullability(bool nullable)
+  {
+    _nullable = nullable;
+    return *this;
+  }
+
+  /**
+   * @brief Specify that this list column should be encoded as a map in the written parquet file
+   *
+   * The column must have the structure list<struct<key, value>>. This option is invalid otherwise
+   *
+   * @return this for chaining
+   */
+  column_in_metadata& set_list_column_as_map()
+  {
+    _list_column_is_map = true;
+    return *this;
+  }
+
+  /**
+   * @brief Specifies whether this timestamp column should be encoded using the deprecated int96
+   * physical type. Only valid for the following column types:
+   * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns
+   *
+   * @param req True = use int96 physical type. False = use int64 physical type
+   * @return this for chaining
+   */
+  column_in_metadata& set_int96_timestamps(bool req)
+  {
+    _use_int96_timestamp = req;
+    return *this;
+  }
+
+  /**
+   * @brief Set the decimal precision of this column. Only valid if this column is a decimal
+   * (fixed-point) type
+   *
+   * @param precision The integer precision to set for this decimal column
+   * @return this for chaining
+   */
+  column_in_metadata& set_decimal_precision(uint8_t precision)
+  {
+    _decimal_precision = precision;
+    return *this;
+  }
+
+  /**
+   * @brief Get reference to a child of this column
+   *
+   * @param i Index of the child to get
+   * @return Reference to this column's child
+   */
+  column_in_metadata& child(size_type i) { return children[i]; }
+
+  /**
+   * @brief Get const reference to a child of this column
+   *
+   * @param i Index of the child to get
+   * @return this for chaining
+   */
+  column_in_metadata const& child(size_type i) const { return children[i]; }
+
+  /**
+   * @brief Get the name of this column
+   */
+  std::string get_name() const { return _name; }
+
+  /**
+   * @brief Get whether nullability has been explicitly set for this column.
+   */
+  bool is_nullability_defined() const { return _nullable.has_value(); }
+
+  /**
+   * @brief Gets the explicitly set nullability for this column.
+   * @throws If nullability is not explicitly defined for this column.
+   *         Check using `is_nullability_defined()` first.
+   */
+  bool nullable() const { return _nullable.value(); }
+
+  /**
+   * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map.
+   */
+  bool is_map() const { return _list_column_is_map; }
+
+  /**
+   * @brief Get whether to encode this timestamp column using deprecated int96 physical type
+   */
+  bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; }
+
+  /**
+   * @brief Get whether precision has been set for this decimal column
+   */
+  bool is_decimal_precision_set() const { return _decimal_precision.has_value(); }
+
+  /**
+   * @brief Get the decimal precision that was set for this column.
+   * @throws If decimal precision was not set for this column.
+   *         Check using `is_decimal_precision_set()` first.
+   */
+  uint8_t get_decimal_precision() const { return _decimal_precision.value(); }
+
+  /**
+   * @brief Get the number of children of this column
+   */
+  size_type num_children() const { return children.size(); }
+};
+
+class table_input_metadata {
+ public:
+  table_input_metadata() = default;  // Required by cython
+
+  /**
+   * @brief Construct a new table_input_metadata from a table_view.
+   *
+   * The constructed table_input_metadata has the same structure as the passed table_view
+   *
+   * @param table The table_view to construct metadata for
+   * @param user_data Optional Additional metadata to encode, as key-value pairs
+   */
+  table_input_metadata(table_view const& table, std::map<std::string, std::string> user_data = {});
+
+  std::vector<column_in_metadata> column_metadata;
+  std::map<std::string, std::string> user_data;  //!< Format-dependent metadata as key-value pairs
+};
 
 /**
  * @brief Class to build `parquet_writer_options`.
@@ -395,14 +551,12 @@ class parquet_writer_options {
   // Sets of columns to output
   table_view _table;
   // Optional associated metadata
-  const table_metadata* _metadata = nullptr;
-  // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
+  table_input_metadata const* _metadata = nullptr;
+  // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
+  // If true then overrides any per-column setting in _metadata.
   bool _write_timestamps_as_int96 = false;
   // Column chunks file path to be set in the raw output metadata
   std::string _column_chunks_file_path;
-  /// vector of precision values for decimal writing. Exactly one entry
-  /// per decimal column. Optional unless decimals are being written.
-  std::vector<uint8_t> _decimal_precision;
 
   /**
    * @brief Constructor from sink and table.
@@ -465,7 +619,7 @@ class parquet_writer_options {
   /**
    * @brief Returns associated metadata.
    */
-  table_metadata const* get_metadata() const { return _metadata; }
+  table_input_metadata const* get_metadata() const { return _metadata; }
 
   /**
    * @brief Returns `true` if timestamps will be written as INT96
@@ -477,17 +631,12 @@ class parquet_writer_options {
    */
   std::string get_column_chunks_file_path() const { return _column_chunks_file_path; }
 
-  /**
-   * @brief Returns a constant reference to the decimal precision vector.
-   */
-  std::vector<uint8_t> const& get_decimal_precision() const { return _decimal_precision; }
-
   /**
    * @brief Sets metadata.
    *
    * @param metadata Associated metadata.
    */
-  void set_metadata(table_metadata const* metadata) { _metadata = metadata; }
+  void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; }
 
   /**
    * @brief Sets the level of statistics.
@@ -520,11 +669,6 @@ class parquet_writer_options {
   {
     _column_chunks_file_path.assign(file_path);
   }
-
-  /**
-   * @brief Sets the decimal precision vector data.
-   */
-  void set_decimal_precision(std::vector<uint8_t> dp) { _decimal_precision = std::move(dp); }
 };
 
 class parquet_writer_options_builder {
@@ -555,7 +699,7 @@ class parquet_writer_options_builder {
    * @param metadata Associated metadata.
    * @return this for chaining.
    */
-  parquet_writer_options_builder& metadata(table_metadata const* metadata)
+  parquet_writer_options_builder& metadata(table_input_metadata const* metadata)
   {
     options._metadata = metadata;
     return *this;
@@ -672,11 +816,10 @@ class chunked_parquet_writer_options {
   // Specify the level of statistics in the output file
   statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP;
   // Optional associated metadata.
-  const table_metadata_with_nullability* _nullable_metadata = nullptr;
-  // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
+  table_input_metadata const* _metadata = nullptr;
+  // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
+  // If true then overrides any per-column setting in _metadata.
   bool _write_timestamps_as_int96 = false;
-  // Optional decimal precision data - must be present if writing decimals
-  std::vector<uint8_t> _decimal_precision = {};
 
   /**
    * @brief Constructor from sink.
@@ -711,17 +854,9 @@ class chunked_parquet_writer_options {
   statistics_freq get_stats_level() const { return _stats_level; }
 
   /**
-   * @brief Returns nullable metadata information.
+   * @brief Returns metadata information.
    */
-  const table_metadata_with_nullability* get_nullable_metadata() const
-  {
-    return _nullable_metadata;
-  }
-
-  /**
-   * @brief Returns decimal precision pointer.
-   */
-  std::vector<uint8_t> const& get_decimal_precision() const { return _decimal_precision; }
+  table_input_metadata const* get_metadata() const { return _metadata; }
 
   /**
    * @brief Returns `true` if timestamps will be written as INT96
@@ -729,22 +864,11 @@ class chunked_parquet_writer_options {
   bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
 
   /**
-   * @brief Sets nullable metadata.
+   * @brief Sets metadata.
    *
    * @param metadata Associated metadata.
    */
-  void set_nullable_metadata(const table_metadata_with_nullability* metadata)
-  {
-    _nullable_metadata = metadata;
-  }
-
-  /**
-   * @brief Sets decimal precision data.
-   *
-   * @param v Vector of precision data flattened with exactly one entry per
-   *          decimal column.
-   */
-  void set_decimal_precision_data(std::vector<uint8_t> const& v) { _decimal_precision = v; }
+  void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; }
 
   /**
    * @brief Sets the level of statistics in parquet_writer_options.
@@ -797,15 +921,14 @@ class chunked_parquet_writer_options_builder {
   chunked_parquet_writer_options_builder(sink_info const& sink) : options(sink){};
 
   /**
-   * @brief Sets nullable metadata to chunked_parquet_writer_options.
+   * @brief Sets metadata to chunked_parquet_writer_options.
    *
    * @param metadata Associated metadata.
    * @return this for chaining.
    */
-  chunked_parquet_writer_options_builder& nullable_metadata(
-    const table_metadata_with_nullability* metadata)
+  chunked_parquet_writer_options_builder& metadata(table_input_metadata const* metadata)
   {
-    options._nullable_metadata = metadata;
+    options._metadata = metadata;
     return *this;
   }
 
@@ -821,18 +944,6 @@ class chunked_parquet_writer_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Sets decimal precision data.
-   *
-   * @param v Vector of precision data flattened with exactly one entry per
-   *          decimal column.
-   */
-  chunked_parquet_writer_options_builder& decimal_precision(std::vector<uint8_t> const& v)
-  {
-    options._decimal_precision = v;
-    return *this;
-  }
-
   /**
    * @brief Sets compression type to chunked_parquet_writer_options.
    *
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 1b7635f8d0d..bc6d36a0328 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -419,6 +419,22 @@ std::unique_ptr<std::vector<uint8_t>> merge_rowgroup_metadata(
   return detail_parquet::writer::merge_rowgroup_metadata(metadata_list);
 }
 
+table_input_metadata::table_input_metadata(table_view const& table,
+                                           std::map<std::string, std::string> user_data)
+  : user_data{std::move(user_data)}
+{
+  // Create a metadata hierarchy using `table`
+  std::function<column_in_metadata(column_view const&)> get_children = [&](column_view const& col) {
+    auto col_meta = column_in_metadata{};
+    std::transform(
+      col.child_begin(), col.child_end(), std::back_inserter(col_meta.children), get_children);
+    return col_meta;
+  };
+
+  std::transform(
+    table.begin(), table.end(), std::back_inserter(this->column_metadata), get_children);
+}
+
 /**
  * @copydoc cudf::io::write_parquet
  */
diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu
index d984cc1e44f..46d471d5cf7 100644
--- a/cpp/src/io/parquet/page_dict.cu
+++ b/cpp/src/io/parquet/page_dict.cu
@@ -36,7 +36,7 @@ struct dict_state_s {
   uint32_t num_dict_entries;    //!< Dictionary entries in current fragment to add
   uint32_t frag_dict_size;
   EncColumnChunk ck;
-  EncColumnDesc col;
+  parquet_column_device_view col;
   PageFragment frag;
   volatile uint32_t scratch_red[32];
   uint16_t frag_dict[max_page_fragment_size];
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 8b99248e2fd..3b29394686f 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -44,7 +44,7 @@ constexpr int init_hash_bits       = 12;
 constexpr uint32_t rle_buffer_size = (1 << 9);
 
 struct frag_init_state_s {
-  EncColumnDesc col;
+  parquet_column_device_view col;
   PageFragment frag;
   uint32_t total_dupes;
   size_type start_value_idx;
@@ -70,7 +70,7 @@ struct page_enc_state_s {
   volatile uint32_t scratch_red[32];
   EncPage page;
   EncColumnChunk ck;
-  EncColumnDesc col;
+  parquet_column_device_view col;
   gpu_inflate_input_s comp_in;
   gpu_inflate_status_s comp_out;
   uint16_t vals[rle_buffer_size];
@@ -111,12 +111,13 @@ inline __device__ uint32_t uint64_init_hash(uint64_t v)
  */
 // blockDim {512,1,1}
 template <int block_size>
-__global__ void __launch_bounds__(block_size) gpuInitPageFragments(PageFragment *frag,
-                                                                   const EncColumnDesc *col_desc,
-                                                                   int32_t num_fragments,
-                                                                   int32_t num_columns,
-                                                                   uint32_t fragment_size,
-                                                                   uint32_t max_num_rows)
+__global__ void __launch_bounds__(block_size)
+  gpuInitPageFragments(PageFragment *frag,
+                       const parquet_column_device_view *col_desc,
+                       int32_t num_fragments,
+                       int32_t num_columns,
+                       uint32_t fragment_size,
+                       uint32_t max_num_rows)
 {
   __shared__ __align__(16) frag_init_state_s state_g;
 
@@ -158,12 +159,18 @@ __global__ void __launch_bounds__(block_size) gpuInitPageFragments(PageFragment
     } else {
       auto col                     = *(s->col.parent_column);
       auto current_start_value_idx = start_row;
-      while (col.type().id() == type_id::LIST) {
-        auto offset_col = col.child(lists_column_view::offsets_column_index);
-        current_start_value_idx =
-          offset_col.element<size_type>(current_start_value_idx + col.offset());
-        end_value_idx = offset_col.element<size_type>(end_value_idx + col.offset());
-        col           = col.child(lists_column_view::child_column_index);
+      while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
+        if (col.type().id() == type_id::STRUCT) {
+          current_start_value_idx += col.offset();
+          end_value_idx += col.offset();
+          col = col.child(0);
+        } else {
+          auto offset_col = col.child(lists_column_view::offsets_column_index);
+          current_start_value_idx =
+            offset_col.element<size_type>(current_start_value_idx + col.offset());
+          end_value_idx = offset_col.element<size_type>(end_value_idx + col.offset());
+          col           = col.child(lists_column_view::child_column_index);
+        }
       }
       s->start_value_idx = current_start_value_idx;
     }
@@ -372,12 +379,13 @@ __global__ void __launch_bounds__(block_size) gpuInitPageFragments(PageFragment
 }
 
 // blockDim {128,1,1}
-__global__ void __launch_bounds__(128) gpuInitFragmentStats(statistics_group *groups,
-                                                            const PageFragment *fragments,
-                                                            const EncColumnDesc *col_desc,
-                                                            int32_t num_fragments,
-                                                            int32_t num_columns,
-                                                            uint32_t fragment_size)
+__global__ void __launch_bounds__(128)
+  gpuInitFragmentStats(statistics_group *groups,
+                       const PageFragment *fragments,
+                       const parquet_column_device_view *col_desc,
+                       int32_t num_fragments,
+                       int32_t num_columns,
+                       uint32_t fragment_size)
 {
   __shared__ __align__(8) statistics_group group_g[4];
 
@@ -397,13 +405,13 @@ __global__ void __launch_bounds__(128) gpuInitFragmentStats(statistics_group *gr
 // blockDim {128,1,1}
 __global__ void __launch_bounds__(128) gpuInitPages(EncColumnChunk *chunks,
                                                     EncPage *pages,
-                                                    const EncColumnDesc *col_desc,
+                                                    const parquet_column_device_view *col_desc,
                                                     statistics_merge_group *page_grstats,
                                                     statistics_merge_group *chunk_grstats,
                                                     int32_t num_rowgroups,
                                                     int32_t num_columns)
 {
-  __shared__ __align__(8) EncColumnDesc col_g;
+  __shared__ __align__(8) parquet_column_device_view col_g;
   __shared__ __align__(8) EncColumnChunk ck_g;
   __shared__ __align__(8) PageFragment frag_g;
   __shared__ __align__(8) EncPage page_g;
@@ -541,8 +549,8 @@ __global__ void __launch_bounds__(128) gpuInitPages(EncColumnChunk *chunks,
           page_g.num_rows         = rows_in_page;
           page_g.num_leaf_values  = leaf_values_in_page;
           page_g.num_values       = values_in_page;
-          uint32_t def_level_bits = col_g.level_bits & 0xf;
-          uint32_t rep_level_bits = col_g.level_bits >> 4;
+          uint32_t def_level_bits = col_g.num_def_level_bits();
+          uint32_t rep_level_bits = col_g.num_rep_level_bits();
           // Run length = 4, max(rle/bitpack header) = 5, add one byte per 256 values for overhead
           // TODO (dm): Improve readability of these calculations.
           uint32_t def_level_size =
@@ -936,10 +944,12 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages,
   __syncthreads();
 
   // Encode Repetition and Definition levels
-  if (s->page.page_type != PageType::DICTIONARY_PAGE && s->col.level_bits != 0 &&
-      s->col.parent_column == nullptr) {
+  if (s->page.page_type != PageType::DICTIONARY_PAGE &&
+      (s->col.num_def_level_bits()) != 0 &&  // This means max definition level is not 0 (nullable)
+      (s->col.num_rep_level_bits()) == 0     // This means there are no repetition levels (non-list)
+  ) {
     // Calculate definition levels from validity
-    uint32_t def_lvl_bits = s->col.level_bits & 0xf;
+    uint32_t def_lvl_bits = s->col.num_def_level_bits();
     if (def_lvl_bits != 0) {
       if (!t) {
         s->rle_run     = 0;
@@ -954,9 +964,32 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages,
         uint32_t row         = s->page.start_row + rle_numvals + t;
         // Definition level encodes validity. Checks the valid map and if it is valid, then sets the
         // def_lvl accordingly and sets it in s->vals which is then given to RleEncode to encode
-        uint32_t def_lvl = (rle_numvals + t < s->page.num_rows && row < s->col.num_rows)
-                             ? s->col.leaf_column->is_valid(row)
-                             : 0;
+        uint32_t def_lvl = [&]() {
+          bool within_bounds = rle_numvals + t < s->page.num_rows && row < s->col.num_rows;
+          if (not within_bounds) { return 0u; }
+          uint32_t def       = 0;
+          size_type l        = 0;
+          bool is_col_struct = false;
+          auto col           = *s->col.parent_column;
+          do {
+            // If col not nullable then it does not contribute to def levels
+            if (s->col.nullability[l]) {
+              if (col.is_valid(row)) {
+                ++def;
+              } else {
+                // We have found the shallowest level at which this row is null
+                break;
+              }
+            }
+            is_col_struct = (col.type().id() == type_id::STRUCT);
+            if (is_col_struct) {
+              row += col.offset();
+              col = col.child(0);
+              ++l;
+            }
+          } while (is_col_struct);
+          return def;
+        }();
         s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl;
         __syncthreads();
         rle_numvals += nrows;
@@ -974,7 +1007,9 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages,
         if (t == 0) { s->cur = rle_out; }
       }
     }
-  } else if (s->page.page_type != PageType::DICTIONARY_PAGE && s->col.parent_column != nullptr) {
+  } else if (s->page.page_type != PageType::DICTIONARY_PAGE &&
+             s->col.num_rep_level_bits() != 0  // This means there ARE repetition levels (has list)
+  ) {
     auto encode_levels = [&](uint8_t const *lvl_val_data, uint32_t nbits) {
       // For list types, the repetition and definition levels are pre-calculated. We just need to
       // encode and write them now.
@@ -1010,9 +1045,9 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages,
         if (t == 0) { s->cur = rle_out; }
       }
     };
-    encode_levels(s->col.rep_values, s->col.level_bits >> 4);
+    encode_levels(s->col.rep_values, s->col.num_rep_level_bits());
     __syncthreads();
-    encode_levels(s->col.def_values, s->col.level_bits & 0xf);
+    encode_levels(s->col.def_values, s->col.num_def_level_bits());
   }
   // Encode data values
   __syncthreads();
@@ -1041,10 +1076,15 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages,
     if (s->col.parent_column != nullptr) {
       auto col                    = *(s->col.parent_column);
       auto current_page_start_val = s->page_start_val;
-      while (col.type().id() == type_id::LIST) {
-        current_page_start_val = col.child(lists_column_view::offsets_column_index)
-                                   .element<size_type>(current_page_start_val + col.offset());
-        col = col.child(lists_column_view::child_column_index);
+      while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
+        if (col.type().id() == type_id::STRUCT) {
+          current_page_start_val += col.offset();
+          col = col.child(0);
+        } else {
+          current_page_start_val = col.child(lists_column_view::offsets_column_index)
+                                     .element<size_type>(current_page_start_val + col.offset());
+          col = col.child(lists_column_view::child_column_index);
+        }
       }
       s->page_start_val = current_page_start_val;
     }
@@ -1156,11 +1196,13 @@ __global__ void __launch_bounds__(128, 8) gpuEncodePages(EncPage *pages,
             auto const ret = convert_nanoseconds([&]() {
               using namespace cuda::std::chrono;
 
-              switch (s->col.converted_type) {
-                case TIMESTAMP_MILLIS: {
+              switch (s->col.leaf_column->type().id()) {
+                case type_id::TIMESTAMP_SECONDS:
+                case type_id::TIMESTAMP_MILLISECONDS: {
                   return sys_time{milliseconds{v}};
                 } break;
-                case TIMESTAMP_MICROS: {
+                case type_id::TIMESTAMP_MICROSECONDS:
+                case type_id::TIMESTAMP_NANOSECONDS: {
                   return sys_time{microseconds{v}};
                 } break;
               }
@@ -1383,7 +1425,7 @@ class header_encoder {
 
 __device__ uint8_t *EncodeStatistics(uint8_t *start,
                                      const statistics_chunk *s,
-                                     const EncColumnDesc *col,
+                                     const parquet_column_device_view *col,
                                      float *fp_scratch)
 {
   uint8_t *end, dtype, dtype_len;
@@ -1441,7 +1483,7 @@ __global__ void __launch_bounds__(128) gpuEncodePageHeaders(EncPage *pages,
                                                             const statistics_chunk *chunk_stats,
                                                             uint32_t start_page)
 {
-  __shared__ __align__(8) EncColumnDesc col_g;
+  __shared__ __align__(8) parquet_column_device_view col_g;
   __shared__ __align__(8) EncColumnChunk ck_g;
   __shared__ __align__(8) EncPage page_g;
   __shared__ __align__(8) float fp_scratch[2];
@@ -1567,6 +1609,42 @@ __global__ void __launch_bounds__(1024) gpuGatherPages(EncColumnChunk *chunks, c
   }
 }
 
+/**
+ * @brief Functor to get definition level value for a nested struct column until the leaf level or
+ * the first list level.
+ *
+ */
+struct def_level_fn {
+  column_device_view const *parent_col;
+  uint8_t const *d_nullability;
+  uint8_t sub_level_start;
+  uint8_t curr_def_level;
+
+  __device__ uint32_t operator()(size_type i)
+  {
+    uint32_t def       = curr_def_level;
+    uint8_t l          = sub_level_start;
+    bool is_col_struct = false;
+    auto col           = *parent_col;
+    do {
+      // If col not nullable then it does not contribute to def levels
+      if (d_nullability[l]) {
+        if (not col.nullable() or bit_is_set(col.null_mask(), i)) {
+          ++def;
+        } else {  // We have found the shallowest level at which this row is null
+          break;
+        }
+      }
+      is_col_struct = (col.type().id() == type_id::STRUCT);
+      if (is_col_struct) {
+        col = col.child(0);
+        ++l;
+      }
+    } while (is_col_struct);
+    return def;
+  }
+};
+
 /**
  * @brief Get the dremel offsets and repetition and definition levels for a LIST column
  *
@@ -1633,16 +1711,53 @@ __global__ void __launch_bounds__(1024) gpuGatherPages(EncColumnChunk *chunks, c
  * ```
  *
  * Similarly we merge up all the way till level 0 offsets
+ *
+ * STRUCT COLUMNS :
+ * In case of struct columns, we don't have to merge struct levels with their children because a
+ * struct is the same size as its children. e.g. for a column `struct`, if the row `i`
+ * is null, then the children columns `int` and `float` are also null at `i`. They also have the
+ * null entry represented in their respective null masks. So for any case of strictly struct based
+ * nesting, we can get the definition levels merely by iterating over the nesting for the same row.
+ *
+ * In case struct and lists are intermixed, the definition levels of all the contiguous struct
+ * levels can be constructed using the aforementioned iterative method. Only when we reach a list
+ * level, we need to do a merge with the subsequent level.
+ *
+ * So, for a column like `struct<list<int>>`, we are going to merge between the levels `struct<list`
+ * and `int`.
+ * For a column like `list<struct<int>>`, we are going to merge between `list` and `struct<int>`.
+ *
+ * In general, one nesting level is the list level and any struct level that precedes it.
+ *
+ * A few more examples to visualize the partitioning of column hierarchy into nesting levels:
+ * (L is list, S is struct, i is integer (leaf data level), angle brackets omitted)
+ * ```
+ * 1. LSi     = L   Si
+ *              - | --
+ *
+ * 2. LLSi    = L   L   Si
+ *              - | - | --
+ *
+ * 3. SSLi    = SSL   i
+ *              --- | -
+ *
+ * 4. LLSLSSi = L   L   SL   SSi
+ *              - | - | -- | ---
+ * ```
  */
 dremel_data get_dremel_data(column_view h_col,
-                            std::vector<bool> const &level_nullability,
+                            // TODO(cp): use device_span once it is converted to a single hd_vec
+                            rmm::device_uvector<uint8_t> const &d_nullability,
+                            std::vector<uint8_t> const &nullability,
                             rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(h_col.type().id() == type_id::LIST,
-               "Can only get rep/def levels for LIST type column");
+  auto get_list_level = [](column_view col) {
+    while (col.type().id() == type_id::STRUCT) { col = col.child(0); }
+    return col;
+  };
 
   auto get_empties = [&](column_view col, size_type start, size_type end) {
-    auto lcv = lists_column_view(col);
+    auto lcv = lists_column_view(get_list_level(col));
     rmm::device_uvector<size_type> empties_idx(lcv.size(), stream);
     rmm::device_uvector<size_type> empties(lcv.size(), stream);
     auto d_off = lcv.offsets().data<size_type>();
@@ -1663,38 +1778,60 @@ dremel_data get_dremel_data(column_view h_col,
     return std::make_tuple(std::move(empties), std::move(empties_idx), empties_size);
   };
 
-  // Reverse the nesting in order to merge the deepest level with the leaf first and merge bottom
-  // up
-  auto curr_col        = h_col;
-  size_t max_vals_size = 0;
+  auto curr_col = h_col;
   std::vector<column_view> nesting_levels;
   std::vector<size_type> def_at_level;
-  size_type level       = 0;
-  auto add_def_at_level = [&](size_type level) {
-    auto is_level_nullable =
-      curr_col.nullable() or (not level_nullability.empty() and level_nullability[level]);
-    def_at_level.push_back(is_level_nullable ? 2 : 1);
+  std::vector<uint8_t> start_at_sub_level;
+  uint8_t curr_nesting_level_idx = 0;
+
+  auto add_def_at_level = [&](column_view col) {
+    // Add up all def level contributions in this column all the way till the first list column
+    // appears in the hierarchy or until we get to leaf
+    uint32_t def = 0;
+    start_at_sub_level.push_back(curr_nesting_level_idx);
+    while (col.type().id() == type_id::STRUCT) {
+      def += (nullability[curr_nesting_level_idx]) ? 1 : 0;
+      col = col.child(0);
+      ++curr_nesting_level_idx;
+    }
+    // At the end of all those structs is either a list column or the leaf. Leaf column contributes
+    // at least one def level. It doesn't matter what the leaf contributes because it'll be at the
+    // end of the exclusive scan.
+    def += (nullability[curr_nesting_level_idx]) ? 2 : 1;
+    def_at_level.push_back(def);
+    ++curr_nesting_level_idx;
   };
-  while (curr_col.type().id() == type_id::LIST) {
+  while (cudf::is_nested(curr_col.type())) {
     nesting_levels.push_back(curr_col);
-    add_def_at_level(level);
-    auto lcv = lists_column_view(curr_col);
-    max_vals_size += lcv.offsets().size();
-    curr_col = lcv.child();
-    level++;
+    add_def_at_level(curr_col);
+    while (curr_col.type().id() == type_id::STRUCT) {
+      // Go down the hierarchy until we get to the LIST or the leaf level
+      curr_col = curr_col.child(0);
+    }
+    if (curr_col.type().id() == type_id::LIST) {
+      curr_col = curr_col.child(lists_column_view::child_column_index);
+      if (not is_nested(curr_col.type())) {
+        // Special case: when the leaf data column is the immediate child of the list col then we
+        // want it to be included right away. Otherwise the struct containing it will be included in
+        // the next iteration of this loop.
+        nesting_levels.push_back(curr_col);
+        add_def_at_level(curr_col);
+        break;
+      }
+    }
   }
-  // One more entry for leaf col
-  add_def_at_level(level);
-  max_vals_size += curr_col.size();
 
-  // Add one more value at the end so that we can have the max def level
-  def_at_level.push_back(0);
+  std::unique_ptr<rmm::device_buffer> device_view_owners;
+  column_device_view *d_nesting_levels;
+  std::tie(device_view_owners, d_nesting_levels) =
+    contiguous_copy_column_device_views<column_device_view>(nesting_levels, stream);
+
   thrust::exclusive_scan(
     thrust::host, def_at_level.begin(), def_at_level.end(), def_at_level.begin());
 
   // Sliced list column views only have offsets applied to top level. Get offsets for each level.
-  rmm::device_uvector<size_type> d_column_offsets(nesting_levels.size() + 1, stream);
-  rmm::device_uvector<size_type> d_column_ends(nesting_levels.size() + 1, stream);
+  rmm::device_uvector<size_type> d_column_offsets(nesting_levels.size(), stream);
+  rmm::device_uvector<size_type> d_column_ends(nesting_levels.size(), stream);
 
   auto d_col = column_device_view::create(h_col, stream);
   cudf::detail::device_single_thread(
@@ -1709,24 +1846,29 @@ dremel_data get_dremel_data(column_view h_col,
       end_idx_at_level[level] = end;
       ++level;
       // Apply offset recursively until we get to leaf data
-      while (curr_col.type().id() == type_id::LIST) {
-        off = curr_col.child(lists_column_view::offsets_column_index).element<size_type>(off);
-        end = curr_col.child(lists_column_view::offsets_column_index).element<size_type>(end);
-        offset_at_level[level]  = off;
-        end_idx_at_level[level] = end;
-        ++level;
-        curr_col = curr_col.child(lists_column_view::child_column_index);
+      // Skip doing the following for any structs we encounter in between.
+      while (curr_col.type().id() == type_id::LIST or curr_col.type().id() == type_id::STRUCT) {
+        if (curr_col.type().id() == type_id::LIST) {
+          off = curr_col.child(lists_column_view::offsets_column_index).element<size_type>(off);
+          end = curr_col.child(lists_column_view::offsets_column_index).element<size_type>(end);
+          offset_at_level[level]  = off;
+          end_idx_at_level[level] = end;
+          ++level;
+          curr_col = curr_col.child(lists_column_view::child_column_index);
+        } else {
+          curr_col = curr_col.child(0);
+        }
       }
     },
     stream);
 
-  thrust::host_vector<size_type> column_offsets(nesting_levels.size() + 1);
+  thrust::host_vector<size_type> column_offsets(d_column_offsets.size());
   CUDA_TRY(cudaMemcpyAsync(column_offsets.data(),
                            d_column_offsets.data(),
                            d_column_offsets.size() * sizeof(size_type),
                            cudaMemcpyDeviceToHost,
                            stream.value()));
-  thrust::host_vector<size_type> column_ends(nesting_levels.size() + 1);
+  thrust::host_vector<size_type> column_ends(d_column_ends.size());
   CUDA_TRY(cudaMemcpyAsync(column_ends.data(),
                            d_column_ends.data(),
                            d_column_ends.size() * sizeof(size_type),
@@ -1735,6 +1877,11 @@ dremel_data get_dremel_data(column_view h_col,
 
   stream.synchronize();
 
+  size_t max_vals_size = 0;
+  for (size_t l = 0; l < column_offsets.size(); ++l) {
+    max_vals_size += column_ends[l] - column_offsets[l];
+  }
+
   rmm::device_uvector<uint8_t> rep_level(max_vals_size, stream);
   rmm::device_uvector<uint8_t> def_level(max_vals_size, stream);
 
@@ -1745,9 +1892,13 @@ dremel_data get_dremel_data(column_view h_col,
   {
     // At this point, curr_col contains the leaf column. Max nesting level is
     // nesting_levels.size().
-    size_t level              = nesting_levels.size() - 1;
+
+    // We are going to start by merging the last column in nesting_levels (the leaf, which is at the
+    // index `nesting_levels.size() - 1`) with the second-to-last (which is at
+    // `nesting_levels.size() - 2`).
+    size_t level              = nesting_levels.size() - 2;
     curr_col                  = nesting_levels[level];
-    auto lcv                  = lists_column_view(curr_col);
+    auto lcv                  = lists_column_view(get_list_level(curr_col));
     auto offset_size_at_level = column_ends[level] - column_offsets[level] + 1;
 
     // Get empties at this level
@@ -1760,25 +1911,21 @@ dremel_data get_dremel_data(column_view h_col,
     // Merge empty at deepest parent level with the rep, def level vals at leaf level
 
     auto input_parent_rep_it = thrust::make_constant_iterator(level);
-    auto input_parent_def_it = thrust::make_transform_iterator(
-      thrust::make_counting_iterator(0),
-      [idx            = empties_idx.data(),
-       mask           = lcv.null_mask(),
-       level_nullable = level_nullability.empty() ? false : level_nullability[level],
-       curr_def_level = def_at_level[level]] __device__(auto i) {
-        return curr_def_level +
-               ((mask && bit_is_set(mask, idx[i]) or (!mask && level_nullable)) ? 1 : 0);
-      });
-
-    auto input_child_rep_it = thrust::make_constant_iterator(nesting_levels.size());
-    auto input_child_def_it = thrust::make_transform_iterator(
-      thrust::make_counting_iterator(column_offsets[level + 1]),
-      [mask           = lcv.child().null_mask(),
-       level_nullable = level_nullability.empty() ? false : level_nullability[level + 1],
-       curr_def_level = def_at_level[level + 1]] __device__(auto i) {
-        return curr_def_level +
-               ((mask && bit_is_set(mask, i) or (!mask && level_nullable)) ? 1 : 0);
-      });
+    auto input_parent_def_it =
+      thrust::make_transform_iterator(empties_idx.begin(),
+                                      def_level_fn{d_nesting_levels + level,
+                                                   d_nullability.data(),
+                                                   start_at_sub_level[level],
+                                                   def_at_level[level]});
+
+    // `nesting_levels.size()` == no of list levels + leaf. Max repetition level = no of list levels
+    auto input_child_rep_it = thrust::make_constant_iterator(nesting_levels.size() - 1);
+    auto input_child_def_it =
+      thrust::make_transform_iterator(thrust::make_counting_iterator(column_offsets[level + 1]),
+                                      def_level_fn{d_nesting_levels + level + 1,
+                                                   d_nullability.data(),
+                                                   start_at_sub_level[level + 1],
+                                                   def_at_level[level + 1]});
 
     // Zip the input and output value iterators so that merge operation is done only once
     auto input_parent_zip_it =
@@ -1831,9 +1978,11 @@ dremel_data get_dremel_data(column_view h_col,
                     rep_level.begin());
   }
 
-  for (int level = nesting_levels.size() - 2; level >= 0; level--) {
+  // Having already merged the last two levels, we are now going to merge the result with the
+  // third-last level which is at index `nesting_levels.size() - 3`.
+  for (int level = nesting_levels.size() - 3; level >= 0; level--) {
     curr_col                  = nesting_levels[level];
-    auto lcv                  = lists_column_view(curr_col);
+    auto lcv                  = lists_column_view(get_list_level(curr_col));
     auto offset_size_at_level = column_ends[level] - column_offsets[level] + 1;
 
     // Get empties at this level
@@ -1857,15 +2006,12 @@ dremel_data get_dremel_data(column_view h_col,
     auto transformed_empties = thrust::make_transform_iterator(empties.begin(), offset_transformer);
 
     auto input_parent_rep_it = thrust::make_constant_iterator(level);
-    auto input_parent_def_it = thrust::make_transform_iterator(
-      thrust::make_counting_iterator(0),
-      [idx            = empties_idx.data(),
-       mask           = lcv.null_mask(),
-       level_nullable = level_nullability.empty() ? false : level_nullability[level],
-       curr_def_level = def_at_level[level]] __device__(auto i) {
-        return curr_def_level +
-               ((mask && bit_is_set(mask, idx[i]) or (!mask && level_nullable)) ? 1 : 0);
-      });
+    auto input_parent_def_it =
+      thrust::make_transform_iterator(empties_idx.begin(),
+                                      def_level_fn{d_nesting_levels + level,
+                                                   d_nullability.data(),
+                                                   start_at_sub_level[level],
+                                                   def_at_level[level]});
 
     // Zip the input and output value iterators so that merge operation is done only once
     auto input_parent_zip_it =
@@ -1927,16 +2073,10 @@ dremel_data get_dremel_data(column_view h_col,
 
   stream.synchronize();
 
-  size_type leaf_col_offset = column_offsets[column_offsets.size() - 1];
-  size_type leaf_data_size  = column_ends[column_ends.size() - 1] - leaf_col_offset;
-  uint8_t max_def_level     = def_at_level.back() - 1;
+  size_type leaf_data_size = column_ends.back() - column_offsets.back();
 
-  return dremel_data{std::move(new_offsets),
-                     std::move(rep_level),
-                     std::move(def_level),
-                     leaf_col_offset,
-                     leaf_data_size,
-                     max_def_level};
+  return dremel_data{
+    std::move(new_offsets), std::move(rep_level), std::move(def_level), leaf_data_size};
 }
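
For intuition, a hand-worked example (not generated by this code) of the repetition and
definition levels that get_dremel_data computes, for an optional List<int32_t> column with
optional elements (max repetition level 1, max definition level 3):

    rows: [[1, 2], [], null, [3]]

    value  def  rep
    1      3    0    // rep 0: value starts a new row
    2      3    1    // rep 1: value repeats at list level 1
    -      1    0    // []: defined down to the list, which is empty
    -      0    0    // the row itself is null
    3      3    0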
 
 /**
@@ -1949,7 +2089,7 @@ dremel_data get_dremel_data(column_view h_col,
  * @param[in] stream CUDA stream to use, default 0
  */
 void InitPageFragments(PageFragment *frag,
-                       const EncColumnDesc *col_desc,
+                       const parquet_column_device_view *col_desc,
                        int32_t num_fragments,
                        int32_t num_columns,
                        uint32_t fragment_size,
@@ -1974,7 +2114,7 @@ void InitPageFragments(PageFragment *frag,
  */
 void InitFragmentStatistics(statistics_group *groups,
                             const PageFragment *fragments,
-                            const EncColumnDesc *col_desc,
+                            const parquet_column_device_view *col_desc,
                             int32_t num_fragments,
                             int32_t num_columns,
                             uint32_t fragment_size,
@@ -1999,7 +2139,7 @@ void InitFragmentStatistics(statistics_group *groups,
  */
 void InitEncoderPages(EncColumnChunk *chunks,
                       EncPage *pages,
-                      const EncColumnDesc *col_desc,
+                      const parquet_column_device_view *col_desc,
                       int32_t num_rowgroups,
                       int32_t num_columns,
                       statistics_merge_group *page_grstats,
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index f920aee1c29..ad3c214069f 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -215,7 +215,7 @@ struct ColumnChunkDesc {
 /**
  * @brief Struct describing an encoder column
  */
-struct EncColumnDesc : stats_column_desc {
+struct parquet_column_device_view : stats_column_desc {
   uint32_t *dict_index;    //!< Dictionary index [row]
   uint32_t *dict_data;     //!< Dictionary data (unique row indices)
   uint8_t physical_type;   //!< physical data type
@@ -223,12 +223,17 @@ struct EncColumnDesc : stats_column_desc {
   // TODO (dm): Evaluate if this is sufficient. At 4 bits, this allows a maximum 16 level nesting
   uint8_t level_bits;  //!< bits to encode max definition (lower nibble) & repetition (upper nibble)
                        //!< levels
+  constexpr uint8_t num_def_level_bits() { return level_bits & 0xf; }
+  constexpr uint8_t num_rep_level_bits() { return level_bits >> 4; }
   size_type const *const
     *nesting_offsets;  //!< If column is a nested type, contains offset array of each nesting level
 
   size_type const *level_offsets;  //!< Offset array for per-row pre-calculated rep/def level values
   uint8_t const *rep_values;       //!< Pre-calculated repetition level values
   uint8_t const *def_values;       //!< Pre-calculated definition level values
+  uint8_t *nullability;  //!< Array of nullability of each nesting level. e.g. nullability[0] is
+                         //!< nullability of parent_column. May be different from col.nullable() in
+                         //!< case of chunked writing.
 };
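
The nibble packing decoded by num_def_level_bits() and num_rep_level_bits() above is easy
to get backwards. A standalone round-trip check (a sketch for illustration; pack_level_bits
is hypothetical, not part of this patch):

    #include <cassert>
    #include <cstdint>

    // Repetition bit count goes in the upper nibble, definition bit count in the
    // lower nibble, mirroring parquet_column_device_view::level_bits.
    constexpr uint8_t pack_level_bits(uint8_t rep_bits, uint8_t def_bits)
    {
      return static_cast<uint8_t>((rep_bits << 4) | (def_bits & 0xf));
    }

    int main()
    {
      uint8_t const packed = pack_level_bits(/*rep_bits=*/2, /*def_bits=*/3);
      assert((packed & 0xf) == 3);  // num_def_level_bits()
      assert((packed >> 4) == 2);   // num_rep_level_bits()
      return 0;
    }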
 
 constexpr int max_page_fragment_size = 5000;  //!< Max number of rows in a page fragment
@@ -299,15 +304,15 @@ inline size_t __device__ __host__ GetMaxCompressedBfrSize(size_t uncomp_size,
  * @brief Struct describing an encoder column chunk
  */
 struct EncColumnChunk {
-  const EncColumnDesc *col_desc;  //!< Column description
-  PageFragment *fragments;        //!< First fragment in chunk
-  uint8_t *uncompressed_bfr;      //!< Uncompressed page data
-  uint8_t *compressed_bfr;        //!< Compressed page data
-  const statistics_chunk *stats;  //!< Fragment statistics
-  uint32_t bfr_size;              //!< Uncompressed buffer size
-  uint32_t compressed_size;       //!< Compressed buffer size
-  uint32_t start_row;             //!< First row of chunk
-  uint32_t num_rows;              //!< Number of rows in chunk
+  const parquet_column_device_view *col_desc;  //!< Column description
+  PageFragment *fragments;                     //!< First fragment in chunk
+  uint8_t *uncompressed_bfr;                   //!< Uncompressed page data
+  uint8_t *compressed_bfr;                     //!< Compressed page data
+  const statistics_chunk *stats;               //!< Fragment statistics
+  uint32_t bfr_size;                           //!< Uncompressed buffer size
+  uint32_t compressed_size;                    //!< Compressed buffer size
+  uint32_t start_row;                          //!< First row of chunk
+  uint32_t num_rows;                           //!< Number of rows in chunk
   uint32_t num_values;      //!< Number of values in chunk. Different from num_rows for nested types
   uint32_t first_fragment;  //!< First fragment of chunk
   uint32_t first_page;      //!< First page of chunk
@@ -398,9 +403,7 @@ struct dremel_data {
   rmm::device_uvector<uint8_t> rep_level;
   rmm::device_uvector<uint8_t> def_level;
 
-  size_type leaf_col_offset;
   size_type leaf_data_size;
-  uint8_t max_def_level;
 };
 
 /**
@@ -423,8 +426,9 @@ struct dremel_data {
  * @return A struct containing dremel data
  */
 dremel_data get_dremel_data(column_view h_col,
-                            std::vector<bool> const &level_nullability = {},
-                            rmm::cuda_stream_view stream               = rmm::cuda_stream_default);
+                            rmm::device_uvector<uint8_t> const &d_nullability,
+                            std::vector<uint8_t> const &nullability,
+                            rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
 /**
  * @brief Launches kernel for initializing encoder page fragments
@@ -438,7 +442,7 @@ dremel_data get_dremel_data(column_view h_col,
  * @param[in] stream CUDA stream to use, default 0
  */
 void InitPageFragments(PageFragment *frag,
-                       const EncColumnDesc *col_desc,
+                       const parquet_column_device_view *col_desc,
                        int32_t num_fragments,
                        int32_t num_columns,
                        uint32_t fragment_size,
@@ -458,7 +462,7 @@ void InitPageFragments(PageFragment *frag,
  */
 void InitFragmentStatistics(statistics_group *groups,
                             const PageFragment *fragments,
-                            const EncColumnDesc *col_desc,
+                            const parquet_column_device_view *col_desc,
                             int32_t num_fragments,
                             int32_t num_columns,
                             uint32_t fragment_size,
@@ -478,7 +482,7 @@ void InitFragmentStatistics(statistics_group *groups,
  */
 void InitEncoderPages(EncColumnChunk *chunks,
                       EncPage *pages,
-                      const EncColumnDesc *col_desc,
+                      const parquet_column_device_view *col_desc,
                       int32_t num_rowgroups,
                       int32_t num_columns,
                       statistics_merge_group *page_grstats  = nullptr,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index dd68bc50043..31baf419f45 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -70,364 +70,453 @@ parquet::Compression to_parquet_compression(compression_type compression)
   }
 }
 
-std::vector<std::vector<bool>> get_per_column_nullability(table_view const &table,
-                                                          std::vector<bool> const &col_nullable)
-{
-  auto get_depth = [](column_view const &col) {
-    column_view curr_col = col;
-    uint16_t depth       = 1;
-    while (curr_col.type().id() == type_id::LIST) {
-      depth++;
-      curr_col = lists_column_view{curr_col}.child();
+}  // namespace
+
+struct linked_column_view;
+
+using LinkedColPtr    = std::shared_ptr<linked_column_view>;
+using LinkedColVector = std::vector<LinkedColPtr>;
+
+/**
+ * @brief column_view with the added member pointer to the parent of this column.
+ *
+ */
+struct linked_column_view : public column_view {
+  // TODO(cp): we are currently keeping all column_view children info multiple times - once for each
+  //       copy of this object. Options:
+  // 1. Inherit from column_view_base. Only lose out on children vector. That is not needed.
+  // 2. Don't inherit at all. Make linked_column_view keep a reference wrapper to its column_view.
+  linked_column_view(column_view const &col) : column_view(col), parent(nullptr)
+  {
+    for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) {
+      children.push_back(std::make_shared<linked_column_view>(this, *child_it));
     }
-    return depth;
-  };
+  }
 
-  // for each column, check depth and add subsequent bool values to its nullable vector
-  std::vector<std::vector<bool>> per_column_nullability;
-  auto null_it  = col_nullable.begin();
-  auto const_it = thrust::make_constant_iterator(true);
-  for (auto const &col : table) {
-    uint16_t depth = get_depth(col);
-    if (col_nullable.empty()) {
-      // If no per-column nullability is specified then assume that all columns are nullable
-      per_column_nullability.emplace_back(const_it, const_it + depth);
-    } else {
-      CUDF_EXPECTS(
-        null_it + depth <= col_nullable.end(),
-        "Mismatch between size of column nullability passed in user_metadata_with_nullability and "
-        "number of null masks expected in table. Expected more values in passed metadata");
-      per_column_nullability.emplace_back(null_it, null_it + depth);
-      null_it += depth;
+  linked_column_view(linked_column_view *parent, column_view const &col)
+    : column_view(col), parent(parent)
+  {
+    for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) {
+      children.push_back(std::make_shared<linked_column_view>(this, *child_it));
     }
   }
-  CUDF_EXPECTS(
-    null_it == col_nullable.end(),
-    "Mismatch between size of column nullability passed in user_metadata_with_nullability and "
-    "number of null masks expected in table. Too many values in passed metadata");
-  return per_column_nullability;
-}
+
+  linked_column_view *parent;  //!< Pointer to parent of this column. Nullptr if root
+  LinkedColVector children;
+};
 
 /**
- * @brief Get the leaf column
+ * @brief Converts all column_views of a table into linked_column_views
  *
- * Returns the dtype of the leaf column when `col` is a list column.
+ * @param table table of columns to convert
+ * @return Vector of converted linked_column_views
  */
-column_view get_leaf_col(column_view col)
+LinkedColVector input_table_to_linked_columns(table_view const &table)
 {
-  column_view curr_col = col;
-  while (curr_col.type().id() == type_id::LIST) { curr_col = lists_column_view{curr_col}.child(); }
-  return curr_col;
-}
+  LinkedColVector result;
+  for (column_view const &col : table) {
+    result.emplace_back(std::make_shared<linked_column_view>(col));
+  }
 
-}  // namespace
+  return result;
+}
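
A minimal sketch of how the parent links are meant to be used (depth_of is an illustrative
helper, not part of this patch): walk from any linked column back towards the root.

    // Number of ancestors of a linked column; a root column (parent == nullptr) has depth 0.
    int depth_of(linked_column_view const *col)
    {
      int depth = 0;
      while (col->parent != nullptr) {
        col = col->parent;
        ++depth;
      }
      return depth;
    }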
 
 /**
- * @brief Helper kernel for converting string data/offsets into nvstrdesc
- * REMOVEME: Once we eliminate the legacy readers/writers, the kernels could be
- * made to use the native offset+data layout.
+ * @brief Extends SchemaElement to add members required in constructing parquet_column_view
+ *
+ * Added members are:
+ * 1. leaf_column: Pointer to leaf linked_column_view which points to the corresponding data stream
+ *    of a leaf schema node. For non-leaf struct node, this is nullptr.
+ * 2. stats_dtype: datatype for statistics calculation required for the data stream of a leaf node.
+ * 3. ts_scale: scale to multiply or divide timestamp by in order to convert timestamp to parquet
+ *    supported types
  */
-__global__ void stringdata_to_nvstrdesc(gpu::nvstrdesc_s *dst,
-                                        const size_type *offsets,
-                                        const char *strdata,
-                                        const uint32_t *nulls,
-                                        size_type column_size)
-{
-  size_type row = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row < column_size) {
-    uint32_t is_valid = (nulls) ? (nulls[row >> 5] >> (row & 0x1f)) & 1 : 1;
-    size_t count;
-    const char *ptr;
-    if (is_valid) {
-      size_type cur  = offsets[row];
-      size_type next = offsets[row + 1];
-      ptr            = strdata + cur;
-      count          = (next > cur) ? next - cur : 0;
+struct schema_tree_node : public SchemaElement {
+  LinkedColPtr leaf_column;
+  statistics_dtype stats_dtype;
+  int32_t ts_scale;
+
+  // TODO(fut): Think about making schema a class that holds a vector of schema_tree_nodes. The
+  // function construct_schema_tree could be its constructor. It could have a method to get the
+  // per-column nullability, given a schema node index corresponding to a leaf schema. Simpler
+  // still would be a method to get the path in schema, given a leaf node.
+};
+
+struct leaf_schema_fn {
+  schema_tree_node &col_schema;
+  LinkedColPtr const &col;
+  column_in_metadata const &col_meta;
+  bool timestamp_is_int96;
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, bool>::value, void> operator()()
+  {
+    col_schema.type        = Type::BOOLEAN;
+    col_schema.stats_dtype = statistics_dtype::dtype_bool;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, int8_t>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::INT_8;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int8;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, int16_t>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::INT_16;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int16;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, int32_t>::value, void> operator()()
+  {
+    col_schema.type        = Type::INT32;
+    col_schema.stats_dtype = statistics_dtype::dtype_int32;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, int64_t>::value, void> operator()()
+  {
+    col_schema.type        = Type::INT64;
+    col_schema.stats_dtype = statistics_dtype::dtype_int64;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, uint8_t>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::UINT_8;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int8;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, uint16_t>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::UINT_16;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int16;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, uint32_t>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::UINT_32;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, uint64_t>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT64;
+    col_schema.converted_type = ConvertedType::UINT_64;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, float>::value, void> operator()()
+  {
+    col_schema.type        = Type::FLOAT;
+    col_schema.stats_dtype = statistics_dtype::dtype_float32;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, double>::value, void> operator()()
+  {
+    col_schema.type        = Type::DOUBLE;
+    col_schema.stats_dtype = statistics_dtype::dtype_float64;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::string_view>::value, void> operator()()
+  {
+    col_schema.type           = Type::BYTE_ARRAY;
+    col_schema.converted_type = ConvertedType::UTF8;
+    col_schema.stats_dtype    = statistics_dtype::dtype_string;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::timestamp_D>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::DATE;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::timestamp_s>::value, void> operator()()
+  {
+    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
+    col_schema.converted_type =
+      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS;
+    col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+    col_schema.ts_scale    = 1000;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::timestamp_ms>::value, void> operator()()
+  {
+    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
+    col_schema.converted_type =
+      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS;
+    col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::timestamp_us>::value, void> operator()()
+  {
+    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
+    col_schema.converted_type =
+      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS;
+    col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::timestamp_ns>::value, void> operator()()
+  {
+    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
+    col_schema.converted_type =
+      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS;
+    col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+    col_schema.ts_scale    = -1000;  // negative value indicates division by absolute value
+  }
+
+  //  unsupported outside cudf for parquet 1.0.
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::duration_D>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::duration_s>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT64;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+    col_schema.ts_scale       = 1000;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::duration_ms>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT64;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::duration_us>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT64;
+    col_schema.converted_type = ConvertedType::TIME_MICROS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+  }
+
+  //  unsupported outside cudf for parquet 1.0.
+  template <typename T>
+  std::enable_if_t<std::is_same<T, cudf::duration_ns>::value, void> operator()()
+  {
+    col_schema.type           = Type::INT64;
+    col_schema.converted_type = ConvertedType::TIME_MICROS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+    col_schema.ts_scale       = -1000;  // negative value indicates division by absolute value
+  }
+
+  template <typename T>
+  std::enable_if_t<cudf::is_fixed_point<T>(), void> operator()()
+  {
+    if (std::is_same<T, numeric::decimal32>::value) {
+      col_schema.type        = Type::INT32;
+      col_schema.stats_dtype = statistics_dtype::dtype_int32;
+    } else if (std::is_same<T, numeric::decimal64>::value) {
+      col_schema.type        = Type::INT64;
+      col_schema.stats_dtype = statistics_dtype::dtype_decimal64;
     } else {
-      ptr   = nullptr;
-      count = 0;
+      CUDF_FAIL("Unsupported fixed point type for parquet writer");
     }
-    dst[row].ptr   = ptr;
-    dst[row].count = count;
+    col_schema.converted_type = ConvertedType::DECIMAL;
+    col_schema.decimal_scale = -col->type().scale();  // parquet and cudf disagree about scale signs
+    CUDF_EXPECTS(col_meta.is_decimal_precision_set(),
+                 "Precision must be specified for decimal columns");
+    CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale,
+                 "Precision must be equal to or greater than scale!");
+    col_schema.decimal_precision = col_meta.get_decimal_precision();
   }
-}
 
-/**
- * @brief Helper class that adds parquet-specific column info
- */
-class parquet_column_view {
- public:
-  /**
-   * @brief Constructor that extracts out the string position + length pairs
-   * for building dictionaries for string columns
-   */
-  explicit parquet_column_view(size_t id,
-                               column_view const &col,
-                               std::vector<bool> const &nullability,
-                               const table_metadata *metadata,
-                               bool int96_timestamps,
-                               std::vector<uint8_t> const &decimal_precision,
-                               uint &decimal_precision_idx,
-                               rmm::cuda_stream_view stream)
-    : _col(col),
-      _leaf_col(get_leaf_col(col)),
-      _id(id),
-      _string_type(_leaf_col.type().id() == type_id::STRING),
-      _list_type(col.type().id() == type_id::LIST),
-      _type_width((_string_type || _list_type) ? 0 : cudf::size_of(col.type())),
-      _row_count(col.size()),
-      _null_count(_leaf_col.null_count()),
-      _data(col.head() + col.offset() * _type_width),
-      _nulls(_leaf_col.nullable() ? _leaf_col.null_mask() : nullptr),
-      _offset(col.offset()),
-      _converted_type(ConvertedType::UNKNOWN),
-      _ts_scale(0),
-      _dremel_offsets(0, stream),
-      _rep_level(0, stream),
-      _def_level(0, stream),
-      _nullability(nullability)
+  template <typename T>
+  std::enable_if_t<cudf::is_nested<T>(), void> operator()()
   {
-    switch (_leaf_col.type().id()) {
-      case cudf::type_id::INT8:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::INT_8;
-        _stats_dtype    = statistics_dtype::dtype_int8;
-        break;
-      case cudf::type_id::INT16:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::INT_16;
-        _stats_dtype    = statistics_dtype::dtype_int16;
-        break;
-      case cudf::type_id::INT32:
-        _physical_type = Type::INT32;
-        _stats_dtype   = statistics_dtype::dtype_int32;
-        break;
-      case cudf::type_id::INT64:
-        _physical_type = Type::INT64;
-        _stats_dtype   = statistics_dtype::dtype_int64;
-        break;
-      case cudf::type_id::UINT8:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::UINT_8;
-        _stats_dtype    = statistics_dtype::dtype_int8;
-        break;
-      case cudf::type_id::UINT16:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::UINT_16;
-        _stats_dtype    = statistics_dtype::dtype_int16;
-        break;
-      case cudf::type_id::UINT32:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::UINT_32;
-        _stats_dtype    = statistics_dtype::dtype_int32;
-        break;
-      case cudf::type_id::UINT64:
-        _physical_type  = Type::INT64;
-        _converted_type = ConvertedType::UINT_64;
-        _stats_dtype    = statistics_dtype::dtype_int64;
-        break;
-      case cudf::type_id::FLOAT32:
-        _physical_type = Type::FLOAT;
-        _stats_dtype   = statistics_dtype::dtype_float32;
-        break;
-      case cudf::type_id::FLOAT64:
-        _physical_type = Type::DOUBLE;
-        _stats_dtype   = statistics_dtype::dtype_float64;
-        break;
-      case cudf::type_id::BOOL8:
-        _physical_type = Type::BOOLEAN;
-        _stats_dtype   = statistics_dtype::dtype_bool;
-        break;
-      // unsupported outside cudf for parquet 1.0.
-      case cudf::type_id::DURATION_DAYS:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::TIME_MILLIS;
-        _stats_dtype    = statistics_dtype::dtype_int64;
-        break;
-      case cudf::type_id::DURATION_SECONDS:
-        _physical_type  = Type::INT64;
-        _converted_type = ConvertedType::TIME_MILLIS;
-        _stats_dtype    = statistics_dtype::dtype_int64;
-        _ts_scale       = 1000;
-        break;
-      case cudf::type_id::DURATION_MILLISECONDS:
-        _physical_type  = Type::INT64;
-        _converted_type = ConvertedType::TIME_MILLIS;
-        _stats_dtype    = statistics_dtype::dtype_int64;
-        break;
-      case cudf::type_id::DURATION_MICROSECONDS:
-        _physical_type  = Type::INT64;
-        _converted_type = ConvertedType::TIME_MICROS;
-        _stats_dtype    = statistics_dtype::dtype_int64;
-        break;
-      // unsupported outside cudf for parquet 1.0.
-      case cudf::type_id::DURATION_NANOSECONDS:
-        _physical_type  = Type::INT64;
-        _converted_type = ConvertedType::TIME_MICROS;
-        _stats_dtype    = statistics_dtype::dtype_int64;
-        _ts_scale       = -1000;  // negative value indicates division by absolute value
-        break;
-      case cudf::type_id::TIMESTAMP_DAYS:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::DATE;
-        _stats_dtype    = statistics_dtype::dtype_int32;
-        break;
-      case cudf::type_id::TIMESTAMP_SECONDS:
-        _physical_type  = int96_timestamps ? Type::INT96 : Type::INT64;
-        _converted_type = ConvertedType::TIMESTAMP_MILLIS;
-        _stats_dtype    = statistics_dtype::dtype_timestamp64;
-        _ts_scale       = 1000;
-        break;
-      case cudf::type_id::TIMESTAMP_MILLISECONDS:
-        _physical_type  = int96_timestamps ? Type::INT96 : Type::INT64;
-        _converted_type = ConvertedType::TIMESTAMP_MILLIS;
-        _stats_dtype    = statistics_dtype::dtype_timestamp64;
-        break;
-      case cudf::type_id::TIMESTAMP_MICROSECONDS:
-        _physical_type  = int96_timestamps ? Type::INT96 : Type::INT64;
-        _converted_type = ConvertedType::TIMESTAMP_MICROS;
-        _stats_dtype    = statistics_dtype::dtype_timestamp64;
-        break;
-      case cudf::type_id::TIMESTAMP_NANOSECONDS:
-        _physical_type  = int96_timestamps ? Type::INT96 : Type::INT64;
-        _converted_type = ConvertedType::TIMESTAMP_MICROS;
-        _stats_dtype    = statistics_dtype::dtype_timestamp64;
-        _ts_scale       = -1000;  // negative value indicates division by absolute value
-        break;
-      case cudf::type_id::STRING:
-        _physical_type  = Type::BYTE_ARRAY;
-        _converted_type = ConvertedType::UTF8;
-        _stats_dtype    = statistics_dtype::dtype_string;
-        break;
-      case cudf::type_id::DECIMAL32:
-        _physical_type  = Type::INT32;
-        _converted_type = ConvertedType::DECIMAL;
-        _stats_dtype    = statistics_dtype::dtype_int32;
-        _decimal_scale  = -_leaf_col.type().scale();  // parquet and cudf disagree about scale signs
-        CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx,
-                     "Not enough decimal precision values passed for data!");
-        CUDF_EXPECTS(decimal_precision[decimal_precision_idx] >= _decimal_scale,
-                     "Precision must be equal to or greater than scale!");
-        _decimal_precision = decimal_precision[decimal_precision_idx++];
-        break;
-      case cudf::type_id::DECIMAL64:
-        _physical_type  = Type::INT64;
-        _converted_type = ConvertedType::DECIMAL;
-        _stats_dtype    = statistics_dtype::dtype_decimal64;
-        _decimal_scale  = -_leaf_col.type().scale();  // parquet and cudf disagree about scale signs
-        CUDF_EXPECTS(decimal_precision.size() > decimal_precision_idx,
-                     "Not enough decimal precision values passed for data!");
-        CUDF_EXPECTS(decimal_precision[decimal_precision_idx] >= _decimal_scale,
-                     "Precision must be equal to or greater than scale!");
-        _decimal_precision = decimal_precision[decimal_precision_idx++];
-        break;
-      default:
-        _physical_type = UNDEFINED_TYPE;
-        _stats_dtype   = dtype_none;
-        break;
-    }
-    size_type leaf_col_offset = col.offset();
-    _data_count               = col.size();
-    if (_list_type) {
-      // Top level column's offsets are not applied to all children. Get the effective offset and
-      // size of the leaf column
-      // Calculate row offset into dremel data (repetition/definition values) and the respective
-      // definition and repetition levels
-      gpu::dremel_data dremel = gpu::get_dremel_data(col, _nullability, stream);
-      _dremel_offsets         = std::move(dremel.dremel_offsets);
-      _rep_level              = std::move(dremel.rep_level);
-      _def_level              = std::move(dremel.def_level);
-      leaf_col_offset         = dremel.leaf_col_offset;
-      _data_count             = dremel.leaf_data_size;
-      _max_def_level          = dremel.max_def_level;
-
-      _type_width = (is_fixed_width(_leaf_col.type())) ? cudf::size_of(_leaf_col.type()) : 0;
-      _data       = (is_fixed_width(_leaf_col.type()))
-                ? _leaf_col.head() + leaf_col_offset * _type_width
-                : nullptr;
-
-      // Calculate nesting levels
-      column_view curr_col = col;
-      _nesting_levels      = 0;
-      while (curr_col.type().id() == type_id::LIST) {
-        lists_column_view list_col(curr_col);
-        _nesting_levels++;
-        curr_col = list_col.child();
-      }
+    CUDF_FAIL("This functor is only meant for physical data types");
+  }
 
-      // Update level nullability if no nullability was passed in.
-      curr_col = col;
-      if (_nullability.empty()) {
-        while (curr_col.type().id() == type_id::LIST) {
-          lists_column_view list_col(curr_col);
-          _nullability.push_back(list_col.null_mask() != nullptr);
-          curr_col = list_col.child();
+  template <typename T>
+  std::enable_if_t<cudf::is_dictionary<T>(), void> operator()()
+  {
+    CUDF_FAIL("Dictionary columns are not supported for writing");
+  }
+};
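
leaf_schema_fn relies on SFINAE to select exactly one operator() instantiation per cudf
type. A distilled, self-contained version of the pattern (illustrative types only, not
cudf API):

    #include <iostream>
    #include <type_traits>

    struct describe_type_fn {
      // Viable only when T is an integral type.
      template <typename T>
      std::enable_if_t<std::is_integral<T>::value, void> operator()()
      {
        std::cout << "integral\n";
      }

      // Viable only when T is a floating-point type.
      template <typename T>
      std::enable_if_t<std::is_floating_point<T>::value, void> operator()()
      {
        std::cout << "floating point\n";
      }
    };

    int main()
    {
      describe_type_fn fn;
      fn.operator()<int>();     // prints "integral"
      fn.operator()<double>();  // prints "floating point"
      return 0;
    }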
+
+/**
+ * @brief Construct schema from input columns and per-column input options
+ *
+ * Recursively traverses through linked_columns and corresponding metadata to construct schema tree.
+ * The resulting schema tree is stored in a vector in pre-order traversal order.
+ */
+std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const &linked_columns,
+                                                    table_input_metadata const &metadata,
+                                                    bool single_write_mode,
+                                                    bool int96_timestamps)
+{
+  std::vector<schema_tree_node> schema;
+  schema_tree_node root{};
+  root.type            = UNDEFINED_TYPE;
+  root.repetition_type = NO_REPETITION_TYPE;
+  root.name            = "schema";
+  root.num_children    = linked_columns.size();
+  root.parent_idx      = -1;  // root schema has no parent
+  schema.push_back(std::move(root));
+
+  std::function<void(LinkedColPtr const &, column_in_metadata const &, size_t)> add_schema =
+    [&](LinkedColPtr const &col, column_in_metadata const &col_meta, size_t parent_idx) {
+      bool col_nullable = [&]() {
+        if (single_write_mode) {
+          return col->nullable();
+        } else {
+          if (col_meta.is_nullability_defined()) {
+            if (col_meta.nullable() == false) {
+              CUDF_EXPECTS(
+                col->nullable() == false,
+                "Mismatch in metadata prescribed nullability and input column nullability. "
+                "Metadata for nullable input column cannot prescribe nullability = false");
+            }
+            return col_meta.nullable();
+          } else {
+            // For chunked write, when nullability is not provided, we assume the worst case
+            // scenario: all columns are nullable.
+            return true;
+          }
+        }
+      }();
+
+      if (col->type().id() == type_id::STRUCT) {
+        // if struct, add current and recursively call for all children
+        schema_tree_node struct_schema{};
+        struct_schema.repetition_type =
+          col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
+
+        struct_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
+        struct_schema.num_children = col->num_children();
+        struct_schema.parent_idx   = parent_idx;
+        schema.push_back(std::move(struct_schema));
+
+        auto struct_node_index = schema.size() - 1;
+        // for (auto child_it = col->children.begin(); child_it < col->children.end(); child_it++) {
+        //   add_schema(*child_it, struct_node_index);
+        // }
+        CUDF_EXPECTS(col->num_children() == static_cast<size_type>(col_meta.num_children()),
+                     "Mismatch in number of child columns between input table and metadata");
+        for (size_t i = 0; i < col->children.size(); ++i) {
+          add_schema(col->children[i], col_meta.child(i), struct_node_index);
+        }
+      } else if (col->type().id() == type_id::LIST) {
+        // List schema is denoted by two levels for each nesting level and one final level for leaf.
+        // The top level has the same name as the column name.
+        // So e.g. List<List<int>> is denoted in the schema by
+        // "col_name" : { "list" : { "element" : { "list" : { "element" } } } }
+
+        schema_tree_node list_schema_1{};
+        list_schema_1.converted_type = ConvertedType::LIST;
+        list_schema_1.repetition_type =
+          col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
+        list_schema_1.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
+        list_schema_1.num_children = 1;
+        list_schema_1.parent_idx   = parent_idx;
+        schema.push_back(std::move(list_schema_1));
+
+        schema_tree_node list_schema_2{};
+        list_schema_2.repetition_type = FieldRepetitionType::REPEATED;
+        list_schema_2.name            = "list";
+        list_schema_2.num_children    = 1;
+        list_schema_2.parent_idx      = schema.size() - 1;  // Parent is list_schema_1, last added.
+        schema.push_back(std::move(list_schema_2));
+
+        CUDF_EXPECTS(col_meta.num_children() == 2,
+                     "List column's metadata should have exactly two children");
+
+        add_schema(col->children[lists_column_view::child_column_index],
+                   col_meta.child(lists_column_view::child_column_index),
+                   schema.size() - 1);
+      } else {
+        // if leaf, add current
+        if (col->type().id() == type_id::STRING) {
+          CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0,
+                       "String column's corresponding metadata should have zero or two children");
+        } else {
+          CUDF_EXPECTS(col_meta.num_children() == 0,
+                       "Leaf column's corresponding metadata cannot have children");
         }
-        _nullability.push_back(curr_col.null_mask() != nullptr);
-      }
 
-      stream.synchronize();
-    } else {
-      if (_nullability.empty()) { _nullability = {col.nullable()}; }
-      _max_def_level = (_nullability[0]) ? 1 : 0;
-    }
-    if (_string_type && _data_count > 0) {
-      strings_column_view view{_leaf_col};
-      _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream);
-
-      stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>(
-        reinterpret_cast<gpu::nvstrdesc_s *>(_indexes.data()),
-        view.offsets().data<size_type>() + leaf_col_offset,
-        view.chars().data<char>(),
-        _nulls,
-        _data_count);
-      _data = _indexes.data();
-
-      stream.synchronize();
-    }
+        schema_tree_node col_schema{};
 
-    // Generating default name if name isn't present in metadata
-    if (metadata && _id < metadata->column_names.size()) {
-      _name = metadata->column_names[_id];
-    } else {
-      _name = "_col" + std::to_string(_id);
-    }
-    _path_in_schema.push_back(_name);
+        bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps();
+
+        cudf::type_dispatcher(col->type(),
+                              leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96});
+
+        col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
+        col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
+        col_schema.parent_idx  = parent_idx;
+        col_schema.leaf_column = col;
+        schema.push_back(col_schema);
+      }
+    };
+
+  CUDF_EXPECTS(metadata.column_metadata.size() == linked_columns.size(),
+               "Mismatch in the number of columns and the corresponding metadata elements");
+  // Add all linked_columns to schema using parent_idx = 0 (root)
+  for (size_t i = 0; i < linked_columns.size(); ++i) {
+    add_schema(linked_columns[i], metadata.column_metadata[i], 0);
   }
 
-  auto is_string() const noexcept { return _string_type; }
-  auto is_list() const noexcept { return _list_type; }
-  size_t type_width() const noexcept { return _type_width; }
-  size_t row_count() const noexcept { return _row_count; }
-  size_t data_count() const noexcept { return _data_count; }
-  size_t null_count() const noexcept { return _null_count; }
-  bool nullable() const { return _nullability.back(); }
-  void const *data() const noexcept { return _data; }
-  uint32_t const *nulls() const noexcept { return _nulls; }
-  size_type offset() const noexcept { return _offset; }
-  bool level_nullable(size_t level) const { return _nullability[level]; }
-  int32_t decimal_scale() const noexcept { return _decimal_scale; }
-  uint8_t decimal_precision() const noexcept { return _decimal_precision; }
-
-  // List related data
-  column_view cudf_col() const noexcept { return _col; }
-  column_view leaf_col() const noexcept { return _leaf_col; }
-  size_type nesting_levels() const noexcept { return _nesting_levels; }
-  size_type const *level_offsets() const noexcept { return _dremel_offsets.data(); }
-  uint8_t const *repetition_levels() const noexcept { return _rep_level.data(); }
-  uint8_t const *definition_levels() const noexcept { return _def_level.data(); }
-  uint16_t max_def_level() const noexcept { return _max_def_level; }
-  void set_def_level(uint16_t def_level) { _max_def_level = def_level; }
-
-  auto name() const noexcept { return _name; }
-  auto physical_type() const noexcept { return _physical_type; }
-  auto converted_type() const noexcept { return _converted_type; }
-  auto stats_type() const noexcept { return _stats_dtype; }
-  int32_t ts_scale() const noexcept { return _ts_scale; }
-  void set_path_in_schema(std::vector<std::string> path) { _path_in_schema = std::move(path); }
-  auto get_path_in_schema() const noexcept { return _path_in_schema; }
-
-  // Dictionary management
+  return schema;
+}
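
As a concrete illustration (hand-derived, not produced by this function), a table with a
single nullable List<int32_t> column named "col0" yields this pre-order schema vector:

    index 0: "schema"   root,     num_children = 1,      parent_idx = -1
    index 1: "col0"     OPTIONAL, converted_type = LIST, parent_idx = 0
    index 2: "list"     REPEATED, num_children = 1,      parent_idx = 1
    index 3: "element"  OPTIONAL, leaf (INT32),          parent_idx = 2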
+
+/**
+ * @brief Class to store parquet specific information for one data stream.
+ *
+ * Contains information about a single data stream. In case of struct columns, a data stream is one
+ * of the child leaf columns that contains data.
+ * e.g. A column Struct<int, List<float>> contains 2 data streams:
+ * - Struct<int>
+ * - Struct<List<float>>
+ *
+ */
+struct parquet_column_view {
+  parquet_column_view(schema_tree_node const &schema_node,
+                      std::vector<schema_tree_node> const &schema_tree,
+                      rmm::cuda_stream_view stream);
+
+  column_view leaf_column_view() const;
+  gpu::parquet_column_device_view get_device_view();
+
+  column_view cudf_column_view() const { return cudf_col; }
+  parquet::Type physical_type() const { return schema_node.type; }
+
+  std::vector<std::string> const &get_path_in_schema() { return path_in_schema; }
+
+  // LIST related member functions
+  uint8_t max_def_level() const noexcept { return _max_def_level; }
+  uint8_t max_rep_level() const noexcept { return _max_rep_level; }
+  bool is_list() const noexcept { return _is_list; }
+
+  // Dictionary related member functions
   uint32_t *get_dict_data() { return (_dict_data.size()) ? _dict_data.data().get() : nullptr; }
   uint32_t *get_dict_index() { return (_dict_index.size()) ? _dict_index.data().get() : nullptr; }
   void use_dictionary(bool use_dict) { _dictionary_used = use_dict; }
@@ -448,56 +537,185 @@ class parquet_column_view {
   }
 
  private:
-  // cudf data column
-  column_view _col;
-  column_view _leaf_col;
-
-  // Identifier within set of columns
-  size_t _id        = 0;
-  bool _string_type = false;
-  bool _list_type   = false;
-
-  size_t _type_width     = 0;
-  size_t _row_count      = 0;
-  size_t _data_count     = 0;
-  size_t _null_count     = 0;
-  void const *_data      = nullptr;
-  uint32_t const *_nulls = nullptr;
-  size_type _offset      = 0;
-
-  // parquet-related members
-  std::string _name{};
-  Type _physical_type;
-  ConvertedType _converted_type;
-  statistics_dtype _stats_dtype;
-  int32_t _ts_scale;
-  std::vector<std::string> _path_in_schema;
-
-  // Dictionary-related members
-  bool _dictionary_used = false;
-  rmm::device_vector<uint32_t> _dict_data;
-  rmm::device_vector<uint32_t> _dict_index;
+  // Schema related members
+  schema_tree_node schema_node;
+  std::vector<std::string> path_in_schema;
+  uint8_t _max_def_level = 0;
+  uint8_t _max_rep_level = 0;
+  rmm::device_uvector<uint8_t> _d_nullability;
+
+  column_view cudf_col;
 
   // List-related members
+  bool _is_list;
   rmm::device_uvector<size_type>
     _dremel_offsets;  ///< For each row, the absolute offset into the repetition and definition
                       ///< level vectors. O(num rows)
   rmm::device_uvector<uint8_t> _rep_level;
   rmm::device_uvector<uint8_t> _def_level;
-  std::vector<bool> _nullability;
-  size_type _max_def_level  = -1;
-  size_type _nesting_levels = 0;
+  std::vector<uint8_t> _nullability;
+  size_type _data_count = 0;
 
-  // String-related members
-  rmm::device_buffer _indexes;
-
-  // Decimal-related members
-  int32_t _decimal_scale     = 0;
-  uint8_t _decimal_precision = 0;
+  // Dictionary related members
+  bool _dictionary_used = false;
+  rmm::device_vector<uint32_t> _dict_data;
+  rmm::device_vector<uint32_t> _dict_index;
 };
 
+parquet_column_view::parquet_column_view(schema_tree_node const &schema_node,
+                                         std::vector<schema_tree_node> const &schema_tree,
+                                         rmm::cuda_stream_view stream)
+  : schema_node(schema_node),
+    _d_nullability(0, stream),
+    _dremel_offsets(0, stream),
+    _rep_level(0, stream),
+    _def_level(0, stream)
+{
+  // Construct single inheritance column_view from linked_column_view
+  auto curr_col                           = schema_node.leaf_column.get();
+  column_view single_inheritance_cudf_col = *curr_col;
+  while (curr_col->parent) {
+    auto const &parent = *curr_col->parent;
+
+    // For list columns, we still need to retain the offset child column.
+    auto children =
+      (parent.type().id() == type_id::LIST)
+        ? std::vector<column_view>{parent.child(lists_column_view::offsets_column_index),
+                                   single_inheritance_cudf_col}
+        : std::vector<column_view>{single_inheritance_cudf_col};
+
+    single_inheritance_cudf_col = column_view(parent.type(),
+                                              parent.size(),
+                                              parent.head(),
+                                              parent.null_mask(),
+                                              UNKNOWN_NULL_COUNT,
+                                              parent.offset(),
+                                              children);
+
+    curr_col = curr_col->parent;
+  }
+  cudf_col = single_inheritance_cudf_col;
+
+  // Construct path_in_schema by travelling up in the schema_tree
+  std::vector<std::string> path;
+  auto curr_schema_node = schema_node;
+  do {
+    path.push_back(curr_schema_node.name);
+    if (curr_schema_node.parent_idx != -1) {
+      curr_schema_node = schema_tree[curr_schema_node.parent_idx];
+    }
+  } while (curr_schema_node.parent_idx != -1);
+  path_in_schema = std::vector<std::string>(path.crbegin(), path.crend());
+
+  // Calculate max definition level by counting the number of levels that are optional (nullable)
+  // and max repetition level by counting the number of REPEATED levels in this column's hierarchy
+  uint16_t max_def_level = 0;
+  uint16_t max_rep_level = 0;
+  curr_schema_node       = schema_node;
+  while (curr_schema_node.parent_idx != -1) {
+    if (curr_schema_node.repetition_type == parquet::REPEATED or
+        curr_schema_node.repetition_type == parquet::OPTIONAL) {
+      ++max_def_level;
+    }
+    if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; }
+    curr_schema_node = schema_tree[curr_schema_node.parent_idx];
+  }
+  CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported");
+  CUDF_EXPECTS(max_rep_level < 256, "Repetition levels above 255 are not supported");
+
+  _max_def_level = max_def_level;
+  _max_rep_level = max_rep_level;
+
+  // Construct nullability vector using repetition_type from schema.
+  std::vector<bool> r_nullability;
+  curr_schema_node = schema_node;
+  while (curr_schema_node.parent_idx != -1) {
+    if (not curr_schema_node.is_stub()) {
+      r_nullability.push_back(curr_schema_node.repetition_type == FieldRepetitionType::OPTIONAL);
+    }
+    curr_schema_node = schema_tree[curr_schema_node.parent_idx];
+  }
+  _nullability = std::vector<uint8_t>(r_nullability.crbegin(), r_nullability.crend());
+  // TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using
+  // hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column.
+  _d_nullability = rmm::device_uvector<uint8_t>(_nullability.size(), stream);
+  CUDA_TRY(cudaMemcpyAsync(_d_nullability.data(),
+                           _nullability.data(),
+                           _nullability.size() * sizeof(uint8_t),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  _is_list = (_max_rep_level > 0);
+
+  if (cudf_col.size() == 0) { return; }
+
+  if (_is_list) {
+    // Top level column's offsets are not applied to all children. Get the effective offset and
+    // size of the leaf column
+    // Calculate row offset into dremel data (repetition/definition values) and the respective
+    // definition and repetition levels
+    gpu::dremel_data dremel = gpu::get_dremel_data(cudf_col, _d_nullability, _nullability, stream);
+    _dremel_offsets         = std::move(dremel.dremel_offsets);
+    _rep_level              = std::move(dremel.rep_level);
+    _def_level              = std::move(dremel.def_level);
+    _data_count = dremel.leaf_data_size;  // Needed for knowing what size dictionary to allocate
+
+    stream.synchronize();
+  } else {
+    // For non-list struct, the size of the root column is the same as the size of the leaf column
+    _data_count = cudf_col.size();
+  }
+}
+
+column_view parquet_column_view::leaf_column_view() const
+{
+  auto col = cudf_col;
+  while (cudf::is_nested(col.type())) {
+    if (col.type().id() == type_id::LIST) {
+      col = col.child(lists_column_view::child_column_index);
+    } else if (col.type().id() == type_id::STRUCT) {
+      col = col.child(0);  // Stored cudf_col has only one child if struct
+    }
+  }
+  return col;
+}
+
+gpu::parquet_column_device_view parquet_column_view::get_device_view()
+{
+  column_view col  = leaf_column_view();
+  auto desc        = gpu::parquet_column_device_view{};  // Zero out all fields
+  desc.stats_dtype = schema_node.stats_dtype;
+  desc.ts_scale    = schema_node.ts_scale;
+
+  // TODO (dm): Enable dictionary for list after refactor
+  if (physical_type() != BOOLEAN && physical_type() != UNDEFINED_TYPE && !is_list()) {
+    alloc_dictionary(_data_count);
+    desc.dict_index = get_dict_index();
+    desc.dict_data  = get_dict_data();
+  }
+
+  if (is_list()) {
+    desc.level_offsets = _dremel_offsets.data();
+    desc.rep_values    = _rep_level.data();
+    desc.def_values    = _def_level.data();
+  }
+  desc.num_rows      = cudf_col.size();
+  desc.physical_type = static_cast<uint8_t>(physical_type());
+  auto count_bits    = [](uint16_t number) {
+    int16_t nbits = 0;
+    while (number > 0) {
+      nbits++;
+      number >>= 1;
+    }
+    return nbits;
+  };
+  desc.level_bits  = count_bits(max_rep_level()) << 4 | count_bits(max_def_level());
+  desc.nullability = _d_nullability.data();
+  return desc;
+}
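
As a sanity check on the level_bits math above, the count_bits logic worked for the
List<int32_t> example (a standalone sketch, not part of this patch):

    #include <cassert>
    #include <cstdint>

    // Same logic as the count_bits lambda in get_device_view: the number of bits
    // needed to represent `number` (zero needs zero bits).
    int16_t count_bits(uint16_t number)
    {
      int16_t nbits = 0;
      while (number > 0) {
        nbits++;
        number >>= 1;
      }
      return nbits;
    }

    int main()
    {
      assert(count_bits(0) == 0);  // flat REQUIRED column: no level data at all
      assert(count_bits(1) == 1);  // max rep level 1 -> 1 bit
      assert(count_bits(3) == 2);  // max def level 3 -> 2 bits
      return 0;
    }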
+
 void writer::impl::init_page_fragments(hostdevice_vector<gpu::PageFragment> &frag,
-                                       hostdevice_vector<gpu::EncColumnDesc> &col_desc,
+                                       hostdevice_vector<gpu::parquet_column_device_view> &col_desc,
                                        uint32_t num_columns,
                                        uint32_t num_fragments,
                                        uint32_t num_rows,
@@ -513,12 +731,13 @@ void writer::impl::init_page_fragments(hostdevice_vector<gpu::PageFragment> &fra
   frag.device_to_host(stream, true);
 }
 
-void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk,
-                                              hostdevice_vector<gpu::PageFragment> &frag,
-                                              hostdevice_vector<gpu::EncColumnDesc> &col_desc,
-                                              uint32_t num_columns,
-                                              uint32_t num_fragments,
-                                              uint32_t fragment_size)
+void writer::impl::gather_fragment_statistics(
+  statistics_chunk *frag_stats_chunk,
+  hostdevice_vector<gpu::PageFragment> &frag,
+  hostdevice_vector<gpu::parquet_column_device_view> &col_desc,
+  uint32_t num_columns,
+  uint32_t num_fragments,
+  uint32_t fragment_size)
 {
   rmm::device_vector<statistics_group> frag_stats_group(num_fragments * num_columns);
 
@@ -534,11 +753,12 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk
   stream.synchronize();
 }
 
-void writer::impl::build_chunk_dictionaries(hostdevice_vector<gpu::EncColumnChunk> &chunks,
-                                            hostdevice_vector<gpu::EncColumnDesc> &col_desc,
-                                            uint32_t num_rowgroups,
-                                            uint32_t num_columns,
-                                            uint32_t num_dictionaries)
+void writer::impl::build_chunk_dictionaries(
+  hostdevice_vector<gpu::EncColumnChunk> &chunks,
+  hostdevice_vector<gpu::parquet_column_device_view> &col_desc,
+  uint32_t num_rowgroups,
+  uint32_t num_columns,
+  uint32_t num_dictionaries)
 {
   size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize;
   rmm::device_vector<uint32_t> dict_scratch(dict_scratch_size / sizeof(uint32_t));
@@ -560,7 +780,7 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector<gpu::EncColumnChunk> &chunks,
-                                      hostdevice_vector<gpu::EncColumnDesc> &col_desc,
+                                      hostdevice_vector<gpu::parquet_column_device_view> &col_desc,
                                       gpu::EncPage *pages,
                                       statistics_chunk *page_stats,
                                       statistics_chunk *frag_stats,
@@ -651,10 +871,11 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
     stats_granularity_(options.get_stats_level()),
     int96_timestamps(options.is_enabled_int96_timestamps()),
     out_sink_(std::move(sink)),
-    decimal_precision(options.get_decimal_precision()),
-    single_write_mode(mode == SingleWriteMode::YES),
-    user_metadata(options.get_metadata())
+    single_write_mode(mode == SingleWriteMode::YES)
 {
+  if (options.get_metadata()) {
+    table_meta = std::make_unique<table_input_metadata>(*options.get_metadata());
+  }
   init_state();
 }
 
@@ -668,15 +889,12 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
     compression_(to_parquet_compression(options.get_compression())),
     stats_granularity_(options.get_stats_level()),
     int96_timestamps(options.is_enabled_int96_timestamps()),
-    decimal_precision(options.get_decimal_precision()),
     single_write_mode(mode == SingleWriteMode::YES),
     out_sink_(std::move(sink))
 {
-  if (options.get_nullable_metadata() != nullptr) {
-    user_metadata_with_nullability = *options.get_nullable_metadata();
-    user_metadata                  = &user_metadata_with_nullability;
+  if (options.get_metadata()) {
+    table_meta = std::make_unique<table_input_metadata>(*options.get_metadata());
   }
-
   init_state();
 }
 
@@ -695,148 +913,51 @@ void writer::impl::write(table_view const &table)
 {
   CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed");
 
-  size_type num_columns = table.num_columns();
-  size_type num_rows    = table.num_rows();
+  size_type num_rows = table.num_rows();
 
-  // Wrapper around cudf columns to attach parquet-specific type info.
-  // Note : I wish we could do this in the begin() function but since the
-  // metadata is optional we would have no way of knowing how many columns
-  // we actually have.
-  std::vector parquet_columns;
-  parquet_columns.reserve(num_columns);  // Avoids unnecessary re-allocation
-
-  // because the repetition type is global (in the sense of, not per-rowgroup or per write_chunk()
-  // call) we cannot know up front if the user is going to end up passing tables with nulls/no nulls
-  // in the multiple write_chunk() case.  so we'll do some special handling.
-  // The user can pass in information about the nullability of a column to be enforced across
-  // write_chunk() calls, in a flattened bool vector. Figure out that per column.
-  auto per_column_nullability =
-    (single_write_mode)
-      ? std::vector<std::vector<bool>>{}
-      : get_per_column_nullability(table, user_metadata_with_nullability.column_nullable);
-
-  uint decimal_precision_idx = 0;
-
-  for (auto it = table.begin(); it < table.end(); ++it) {
-    const auto col        = *it;
-    const auto current_id = parquet_columns.size();
-
-    // if the user is explicitly saying "I am only calling this once", assume the columns in this
-    // one table tell us everything we need to know about their nullability.
-    // Empty nullability means the writer figures out the nullability from the cudf columns.
-    auto const &this_column_nullability =
-      (single_write_mode) ? std::vector<bool>{} : per_column_nullability[current_id];
-
-    parquet_columns.emplace_back(current_id,
-                                 col,
-                                 this_column_nullability,
-                                 user_metadata,
-                                 int96_timestamps,
-                                 decimal_precision,
-                                 decimal_precision_idx,
-                                 stream);
-  }
+  if (not table_meta) { table_meta = std::make_unique<table_input_metadata>(table); }
 
-  CUDF_EXPECTS(decimal_precision_idx == decimal_precision.size(),
-               "Too many decimal precision values!");
+  // Fill unnamed columns' names in table_meta
+  std::function<void(column_in_metadata &, std::string)> add_default_name =
+    [&](column_in_metadata &col_meta, std::string default_name) {
+      if (col_meta.get_name().empty()) col_meta.set_name(default_name);
+      for (size_type i = 0; i < col_meta.num_children(); ++i) {
+        add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i));
+      }
+    };
+  for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) {
+    add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i));
+  }
 
-  // first call. setup metadata. num_rows will get incremented as write_chunk is
-  // called multiple times.
-  // Calculate the sum of depths of all list columns
-  size_type const list_col_depths = std::accumulate(
-    parquet_columns.cbegin(), parquet_columns.cend(), 0, [](size_type sum, auto const &col) {
-      return sum + col.nesting_levels();
-    });
+  auto vec         = input_table_to_linked_columns(table);
+  auto schema_tree = construct_schema_tree(vec, *table_meta, single_write_mode, int96_timestamps);
+  // Construct parquet_column_views from the schema tree leaf nodes.
+  std::vector<parquet_column_view> parquet_columns;
 
-  // Make schema with current table
-  std::vector<SchemaElement> this_table_schema;
-  {
-    // Each level of nesting requires two levels of Schema. The leaf level needs one schema element
-    this_table_schema.reserve(1 + num_columns + list_col_depths * 2);
-    SchemaElement root{};
-    root.type            = UNDEFINED_TYPE;
-    root.repetition_type = NO_REPETITION_TYPE;
-    root.name            = "schema";
-    root.num_children    = num_columns;
-    this_table_schema.push_back(std::move(root));
-    for (auto i = 0; i < num_columns; i++) {
-      auto &col = parquet_columns[i];
-      if (col.is_list()) {
-        size_type nesting_depth = col.nesting_levels();
-        // Each level of nesting requires two levels of Schema. The leaf level needs one schema
-        // element
-        std::vector<SchemaElement> list_schema(nesting_depth * 2 + 1);
-        for (size_type j = 0; j < nesting_depth; j++) {
-          // List schema is denoted by two levels for each nesting level and one final level for
-          // leaf. The top level is the same name as the column name.
-          // So e.g. List> is denoted in the schema by
-          // "col_name" : { "list" : { "element" : { "list" : { "element" } } } }
-          auto const group_idx = 2 * j;
-          auto const list_idx  = 2 * j + 1;
-
-          list_schema[group_idx].name            = (j == 0) ? col.name() : "element";
-          list_schema[group_idx].repetition_type = (col.level_nullable(j)) ? OPTIONAL : REQUIRED;
-          list_schema[group_idx].converted_type  = ConvertedType::LIST;
-          list_schema[group_idx].num_children    = 1;
-
-          list_schema[list_idx].name            = "list";
-          list_schema[list_idx].repetition_type = REPEATED;
-          list_schema[list_idx].num_children    = 1;
-        }
-        list_schema[nesting_depth * 2].name = "element";
-        list_schema[nesting_depth * 2].repetition_type =
-          col.level_nullable(nesting_depth) ? OPTIONAL : REQUIRED;
-        auto const &physical_type           = col.physical_type();
-        list_schema[nesting_depth * 2].type = physical_type;
-        list_schema[nesting_depth * 2].converted_type =
-          physical_type == parquet::Type::INT96 ? ConvertedType::UNKNOWN : col.converted_type();
-        list_schema[nesting_depth * 2].num_children      = 0;
-        list_schema[nesting_depth * 2].decimal_precision = col.decimal_precision();
-        list_schema[nesting_depth * 2].decimal_scale     = col.decimal_scale();
-
-        std::vector<std::string> path_in_schema;
-        std::transform(
-          list_schema.cbegin(), list_schema.cend(), std::back_inserter(path_in_schema), [](auto s) {
-            return s.name;
-          });
-        col.set_path_in_schema(path_in_schema);
-        this_table_schema.insert(this_table_schema.end(), list_schema.begin(), list_schema.end());
-      } else {
-        SchemaElement col_schema{};
-        // Column metadata
-        auto const &physical_type = col.physical_type();
-        col_schema.type           = physical_type;
-        col_schema.converted_type =
-          physical_type == parquet::Type::INT96 ? ConvertedType::UNKNOWN : col.converted_type();
-
-        col_schema.repetition_type =
-          (col.max_def_level() == 1 || (single_write_mode && col.row_count() < (size_t)num_rows))
-            ? OPTIONAL
-            : REQUIRED;
-
-        col_schema.name              = col.name();
-        col_schema.num_children      = 0;  // Leaf node
-        col_schema.decimal_precision = col.decimal_precision();
-        col_schema.decimal_scale     = col.decimal_scale();
-
-        this_table_schema.push_back(std::move(col_schema));
-      }
-    }
+  for (schema_tree_node const &schema_node : schema_tree) {
+    if (schema_node.leaf_column) { parquet_columns.emplace_back(schema_node, schema_tree, stream); }
   }
 
+  // Mass allocation of column_device_views for each parquet_column_view
+  std::vector<column_view> cudf_cols;
+  cudf_cols.reserve(parquet_columns.size());
+  for (auto const &parq_col : parquet_columns) { cudf_cols.push_back(parq_col.cudf_column_view()); }
+  table_view single_streams_table(cudf_cols);
+  size_type num_columns = single_streams_table.num_columns();
+
+  std::vector<SchemaElement> this_table_schema(schema_tree.begin(), schema_tree.end());
+
   if (md.version == 0) {
     md.version  = 1;
     md.num_rows = num_rows;
     md.column_order_listsize =
       (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_columns : 0;
-    if (user_metadata != nullptr) {
-      std::transform(user_metadata->user_data.begin(),
-                     user_metadata->user_data.end(),
-                     std::back_inserter(md.key_value_metadata),
-                     [](auto const &kv) {
-                       return KeyValue{kv.first, kv.second};
-                     });
-    }
+    std::transform(table_meta->user_data.begin(),
+                   table_meta->user_data.end(),
+                   std::back_inserter(md.key_value_metadata),
+                   [](auto const &kv) {
+                     return KeyValue{kv.first, kv.second};
+                   });
     md.schema = this_table_schema;
   } else {
     // verify the user isn't passing mismatched tables
@@ -848,49 +969,17 @@ void writer::impl::write(table_view const &table)
   }
   // Create table_device_view so that corresponding column_device_view data
   // can be written into col_desc members
-  auto parent_column_table_device_view = table_device_view::create(table);
+  auto parent_column_table_device_view = table_device_view::create(single_streams_table);
   rmm::device_uvector leaf_column_views(0, stream);
 
   // Initialize column description
-  hostdevice_vector<gpu::EncColumnDesc> col_desc(num_columns, stream);
-
-  // setup gpu column description.
-  // applicable to only this _write_chunk() call
-  for (auto i = 0; i < num_columns; i++) {
-    auto &col = parquet_columns[i];
-    // GPU column description
-    auto *desc             = &col_desc[i];
-    *desc                  = gpu::EncColumnDesc{};  // Zero out all fields
-    desc->column_data_base = col.data();
-    desc->valid_map_base   = col.nulls();
-    desc->column_offset    = col.offset();
-    desc->stats_dtype      = col.stats_type();
-    desc->ts_scale         = col.ts_scale();
-    // TODO (dm): Enable dictionary for list after refactor
-    if (col.physical_type() != BOOLEAN && col.physical_type() != UNDEFINED_TYPE && !col.is_list()) {
-      col.alloc_dictionary(col.data_count());
-      desc->dict_index = col.get_dict_index();
-      desc->dict_data  = col.get_dict_data();
-    }
-    if (col.is_list()) {
-      desc->level_offsets = col.level_offsets();
-      desc->rep_values    = col.repetition_levels();
-      desc->def_values    = col.definition_levels();
-    }
-    desc->num_values     = col.data_count();
-    desc->num_rows       = col.row_count();
-    desc->physical_type  = static_cast<uint8_t>(col.physical_type());
-    desc->converted_type = static_cast<uint8_t>(col.converted_type());
-    auto count_bits      = [](uint16_t number) {
-      int16_t nbits = 0;
-      while (number > 0) {
-        nbits++;
-        number >>= 1;
-      }
-      return nbits;
-    };
-    desc->level_bits = count_bits(col.nesting_levels()) << 4 | count_bits(col.max_def_level());
-  }
+  hostdevice_vector<gpu::parquet_column_device_view> col_desc(parquet_columns.size(), stream);
+  // This should've been `auto const&` but isn't since dictionary space is allocated when calling
+  // get_device_view(). Fix during dictionary refactor.
+  std::transform(
+    parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [](auto &pcol) {
+      return pcol.get_device_view();
+    });
 
   // Init page fragments
   // 5000 is good enough for up to ~200-character strings. Longer strings will start producing
@@ -909,7 +998,7 @@ void writer::impl::write(table_view const &table)
   if (fragments.size() != 0) {
     // Move column info to device
     col_desc.host_to_device(stream);
-    leaf_column_views = create_leaf_column_device_views<gpu::EncColumnDesc>(
+    leaf_column_views = create_leaf_column_device_views<gpu::parquet_column_device_view>(
       col_desc, *parent_column_table_device_view, stream);
 
     init_page_fragments(fragments, col_desc, num_columns, num_fragments, num_rows, fragment_size);
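
Before moving on to the header changes, the default-naming recursion introduced above is worth seeing in isolation. The following standalone sketch mirrors the add_default_name lambda; `node` here is a hypothetical stand-in for cudf::io::column_in_metadata, modeling only the members the pass touches, so it is illustrative rather than the writer's actual type:

    // Standalone sketch of the add_default_name pass above. `node` is a
    // hypothetical stand-in for cudf::io::column_in_metadata; only the
    // members this pass touches (name, children) are modeled.
    #include <iostream>
    #include <string>
    #include <vector>

    struct node {
      std::string name;
      std::vector<node> children;
    };

    // Unnamed columns get "_col<i>"; children inherit "<parent>_<child-index>".
    void add_default_name(node &n, std::string default_name)
    {
      if (n.name.empty()) n.name = std::move(default_name);
      for (size_t i = 0; i < n.children.size(); ++i) {
        add_default_name(n.children[i], n.name + "_" + std::to_string(i));
      }
    }

    int main()
    {
      std::vector<node> columns{{"", {{"", {}}, {"named", {}}}}, {"", {}}};
      for (size_t i = 0; i < columns.size(); ++i) {
        add_default_name(columns[i], "_col" + std::to_string(i));
      }
      // Prints: _col0 _col0_0 named _col1
      std::cout << columns[0].name << ' ' << columns[0].children[0].name << ' '
                << columns[0].children[1].name << ' ' << columns[1].name << '\n';
    }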
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index f5e0f7408c5..b8532d755eb 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -44,7 +44,7 @@ namespace io {
 namespace detail {
 namespace parquet {
 // Forward internal classes
-class parquet_column_view;
+struct parquet_column_view;
 
 using namespace cudf::io::parquet;
 using namespace cudf::io;
@@ -130,7 +130,7 @@ class writer::impl {
    * @param fragment_size Number of rows per fragment
    */
   void init_page_fragments(hostdevice_vector<gpu::PageFragment>& frag,
-                           hostdevice_vector<gpu::EncColumnDesc>& col_desc,
+                           hostdevice_vector<gpu::parquet_column_device_view>& col_desc,
                            uint32_t num_columns,
                            uint32_t num_fragments,
                            uint32_t num_rows,
@@ -148,7 +148,7 @@ class writer::impl {
    */
   void gather_fragment_statistics(statistics_chunk* dst_stats,
                                   hostdevice_vector<gpu::PageFragment>& frag,
-                                  hostdevice_vector<gpu::EncColumnDesc>& col_desc,
+                                  hostdevice_vector<gpu::parquet_column_device_view>& col_desc,
                                   uint32_t num_columns,
                                   uint32_t num_fragments,
                                   uint32_t fragment_size);
@@ -162,7 +162,7 @@ class writer::impl {
    * @param num_dictionaries Total number of dictionaries
    */
   void build_chunk_dictionaries(hostdevice_vector<gpu::EncColumnChunk>& chunks,
-                                hostdevice_vector<gpu::EncColumnDesc>& col_desc,
+                                hostdevice_vector<gpu::parquet_column_device_view>& col_desc,
                                 uint32_t num_rowgroups,
                                 uint32_t num_columns,
                                 uint32_t num_dictionaries);
@@ -178,7 +178,7 @@ class writer::impl {
    * @param num_stats_bfr Number of statistics buffers
    */
   void init_encoder_pages(hostdevice_vector<gpu::EncColumnChunk>& chunks,
-                          hostdevice_vector<gpu::EncColumnDesc>& col_desc,
+                          hostdevice_vector<gpu::parquet_column_device_view>& col_desc,
                           gpu::EncPage* pages,
                           statistics_chunk* page_stats,
                           statistics_chunk* frag_stats,
@@ -228,15 +228,9 @@ class writer::impl {
   // Overall file metadata.  Filled in during the process and written during write_chunked_end()
   cudf::io::parquet::FileMetaData md;
   // optional user metadata
-  table_metadata_with_nullability user_metadata_with_nullability;
-  // only used in the write_chunked() case. copied from the (optionally) user supplied
-  // argument to write()
-  table_metadata const* user_metadata = nullptr;
+  std::unique_ptr<table_input_metadata> table_meta;
   // to track if the output has been written to sink
   bool closed = false;
-  // vector of precision values for decimal writing. Exactly one entry
-  // per decimal column.
-  std::vector<uint8_t> decimal_precision;
   // current write position for rowgroups/chunks
   std::size_t current_chunk_offset;
   // special parameter only used by detail::write() to indicate that we are guaranteeing
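
Taken together, the header changes replace the borrowed table_metadata pointer (plus separate nullability and decimal-precision lists) with one owned table_input_metadata that carries names, nullability, and decimal precision for the whole column tree. A minimal usage sketch under stated assumptions: `table` is a populated cudf::table_view whose first column is a struct with at least two children, and the function name and file path are illustrative only.

    // A minimal sketch of the new metadata flow; the cudf calls mirror the
    // tests changed later in this patch.
    #include <cudf/io/parquet.hpp>
    #include <cudf/table/table_view.hpp>

    #include <string>

    void write_with_metadata(cudf::table_view const &table, std::string const &filepath)
    {
      // One column_in_metadata entry per top-level column, mirroring the table.
      cudf::io::table_input_metadata meta(table);
      meta.column_metadata[0].set_name("family");
      meta.column_metadata[0].child(1).set_name("particulars").set_nullability(false);

      auto opts =
        cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table)
          .metadata(&meta)
          .build();
      cudf::io::write_parquet(opts);
    }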
diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh
index 4f41e846631..c08f42583ef 100644
--- a/cpp/src/io/utilities/column_utils.cuh
+++ b/cpp/src/io/utilities/column_utils.cuh
@@ -57,27 +57,24 @@ rmm::device_uvector<column_device_view> create_leaf_column_device_views(
   auto leaf_columns = cudf::device_span<column_device_view>{leaf_column_views};
 
   auto iter = thrust::make_counting_iterator(0);
-  thrust::for_each(rmm::exec_policy(stream),
-                   iter,
-                   iter + parent_table_device_view.num_columns(),
-                   [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__(
-                     size_type index) mutable {
-                     column_device_view col = parent_col_view.column(index);
-
-                     if (col.type().id() == type_id::LIST) {
-                       col_desc[index].parent_column = parent_col_view.begin() + index;
-                     } else {
-                       col_desc[index].parent_column = nullptr;
-                     }
-                     // traverse till leaf column
-                     while (col.type().id() == type_id::LIST) {
-                       col = col.child(lists_column_view::child_column_index);
-                     }
-                     // Store leaf_column to device storage
-                     column_device_view *leaf_col_ptr = leaf_columns.begin() + index;
-                     *leaf_col_ptr                    = col;
-                     col_desc[index].leaf_column      = leaf_col_ptr;
-                   });
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    iter,
+    iter + parent_table_device_view.num_columns(),
+    [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__(
+      size_type index) mutable {
+      col_desc[index].parent_column = parent_col_view.begin() + index;
+      column_device_view col        = parent_col_view.column(index);
+      // traverse till leaf column
+      while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) {
+        col = (col.type().id() == type_id::LIST) ? col.child(lists_column_view::child_column_index)
+                                                 : col.child(0);
+      }
+      // Store leaf_column to device storage
+      column_device_view *leaf_col_ptr = leaf_columns.begin() + index;
+      *leaf_col_ptr                    = col;
+      col_desc[index].leaf_column      = leaf_col_ptr;
+    });
 
   return leaf_column_views;
 }
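
The revised device lambda above descends through both LIST and STRUCT nesting rather than LIST alone. A host-side sketch of the same traversal, written against the public column_view API and assuming (as the writer currently does) a single child stream per struct level:

    // Host-side sketch of the traversal the device lambda performs: walk
    // from a top-level column down through LIST and STRUCT levels to the
    // leaf. Assumes single-child structs, matching the single-stream
    // decomposition the writer applies before calling this utility.
    #include <cudf/column/column_view.hpp>
    #include <cudf/lists/lists_column_view.hpp>
    #include <cudf/types.hpp>

    cudf::column_view leaf_of(cudf::column_view col)
    {
      while (col.type().id() == cudf::type_id::LIST or
             col.type().id() == cudf::type_id::STRUCT) {
        col = (col.type().id() == cudf::type_id::LIST)
                ? col.child(cudf::lists_column_view::child_column_index)
                : col.child(0);
      }
      return col;
    }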
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 420bdc3e3ba..995ee94472f 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -100,6 +100,25 @@ std::unique_ptr create_compressible_fixed_table(cudf::size_type num
   return create_fixed_table(num_columns, num_rows, include_validity, compressible_elements);
 }
 
+void compare_metadata_equality(cudf::io::table_input_metadata in_meta,
+                               cudf::io::table_metadata out_meta)
+{
+  std::function<void(cudf::io::column_name_info, cudf::io::column_in_metadata)> compare_names =
+    [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) {
+      if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); }
+      EXPECT_EQ(out_col.children.size(), in_col.num_children());
+      for (size_t i = 0; i < out_col.children.size(); ++i) {
+        compare_names(out_col.children[i], in_col.child(i));
+      }
+    };
+
+  EXPECT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size());
+
+  for (size_t i = 0; i < out_meta.schema_info.size(); ++i) {
+    compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]);
+  }
+}
+
 // Base test fixture for tests
 struct ParquetWriterTest : public cudf::test::BaseFixture {
 };
@@ -308,16 +327,6 @@ TEST_F(ParquetWriterTest, MultiColumn)
   column_wrapper col6{col6_data, col6_data + num_rows, validity};
   column_wrapper col7{col7_data, col7_data + num_rows, validity};
 
-  cudf_io::table_metadata expected_metadata;
-  // expected_metadata.column_names.emplace_back("bools");
-  expected_metadata.column_names.emplace_back("int8s");
-  expected_metadata.column_names.emplace_back("int16s");
-  expected_metadata.column_names.emplace_back("int32s");
-  expected_metadata.column_names.emplace_back("floats");
-  expected_metadata.column_names.emplace_back("doubles");
-  expected_metadata.column_names.emplace_back("decimal32s");
-  expected_metadata.column_names.emplace_back("decimal64s");
-
   std::vector<std::unique_ptr<column>> cols;
   // cols.push_back(col0.release());
   cols.push_back(col1.release());
@@ -330,12 +339,20 @@ TEST_F(ParquetWriterTest, MultiColumn)
   auto expected = std::make_unique<table>(std::move(cols));
   EXPECT_EQ(7, expected->num_columns());
 
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  // expected_metadata.column_metadata[0].set_name( "bools");
+  expected_metadata.column_metadata[0].set_name("int8s");
+  expected_metadata.column_metadata[1].set_name("int16s");
+  expected_metadata.column_metadata[2].set_name("int32s");
+  expected_metadata.column_metadata[3].set_name("floats");
+  expected_metadata.column_metadata[4].set_name("doubles");
+  expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(10);
+  expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20);
+
   auto filepath = temp_env->get_temp_filepath("MultiColumn.parquet");
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
       .metadata(&expected_metadata);
-  std::vector precisions = {10, 20};
-  out_opts.set_decimal_precision(precisions);
   cudf_io::write_parquet(out_opts);
 
   cudf_io::parquet_reader_options in_opts =
@@ -343,7 +360,7 @@ TEST_F(ParquetWriterTest, MultiColumn)
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, MultiColumnWithNulls)
@@ -390,16 +407,6 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls)
   column_wrapper col6{col6_data, col6_data + num_rows, col6_mask};
   column_wrapper col7{col7_data, col7_data + num_rows, col7_mask};
 
-  cudf_io::table_metadata expected_metadata;
-  // expected_metadata.column_names.emplace_back("bools");
-  expected_metadata.column_names.emplace_back("int8s");
-  expected_metadata.column_names.emplace_back("int16s");
-  expected_metadata.column_names.emplace_back("int32s");
-  expected_metadata.column_names.emplace_back("floats");
-  expected_metadata.column_names.emplace_back("doubles");
-  expected_metadata.column_names.emplace_back("decimal32s");
-  expected_metadata.column_names.emplace_back("decimal64s");
-
   std::vector<std::unique_ptr<column>> cols;
   // cols.push_back(col0.release());
   cols.push_back(col1.release());
@@ -412,12 +419,20 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls)
   auto expected = std::make_unique<table>(std::move(cols));
   EXPECT_EQ(7, expected->num_columns());
 
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  // expected_metadata.column_names.emplace_back("bools");
+  expected_metadata.column_metadata[0].set_name("int8s");
+  expected_metadata.column_metadata[1].set_name("int16s");
+  expected_metadata.column_metadata[2].set_name("int32s");
+  expected_metadata.column_metadata[3].set_name("floats");
+  expected_metadata.column_metadata[4].set_name("doubles");
+  expected_metadata.column_metadata[5].set_name("decimal32s").set_decimal_precision(9);
+  expected_metadata.column_metadata[6].set_name("decimal64s").set_decimal_precision(20);
+
   auto filepath = temp_env->get_temp_filepath("MultiColumnWithNulls.parquet");
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
       .metadata(&expected_metadata);
-  std::vector precisions = {9, 20};
-  out_opts.set_decimal_precision(precisions);
 
   cudf_io::write_parquet(out_opts);
 
@@ -426,7 +441,10 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls)
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  // TODO: Need to be able to return metadata in tree form from reader so they can be compared.
+  // Unfortunately the closest thing to a hierarchical schema is column_name_info which does not
+  // have any tests for it c++ or python.
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, Strings)
@@ -443,11 +461,6 @@ TEST_F(ParquetWriterTest, Strings)
   column_wrapper col1{strings.begin(), strings.end()};
   column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity};
 
-  cudf_io::table_metadata expected_metadata;
-  expected_metadata.column_names.emplace_back("col_other");
-  expected_metadata.column_names.emplace_back("col_string");
-  expected_metadata.column_names.emplace_back("col_another");
-
   std::vector<std::unique_ptr<column>> cols;
   cols.push_back(col0.release());
   cols.push_back(col1.release());
@@ -455,6 +468,11 @@ TEST_F(ParquetWriterTest, Strings)
   auto expected = std::make_unique<table>(std::move(cols));
   EXPECT_EQ(3, expected->num_columns());
 
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  expected_metadata.column_metadata[0].set_name("col_other");
+  expected_metadata.column_metadata[1].set_name("col_string");
+  expected_metadata.column_metadata[2].set_name("col_another");
+
   auto filepath = temp_env->get_temp_filepath("Strings.parquet");
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
@@ -466,7 +484,7 @@ TEST_F(ParquetWriterTest, Strings)
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, SlicedTable)
@@ -479,7 +497,8 @@ TEST_F(ParquetWriterTest, SlicedTable)
   auto seq_col0 = random_values(num_rows);
   auto seq_col2 = random_values(num_rows);
-  auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
+  auto validity =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; });
 
   column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity};
   column_wrapper col1{strings.begin(), strings.end()};
@@ -510,17 +529,64 @@ TEST_F(ParquetWriterTest, SlicedTable)
     },
     valids2};
 
-  cudf_io::table_metadata expected_metadata;
-  expected_metadata.column_names.emplace_back("col_other");
-  expected_metadata.column_names.emplace_back("col_string");
-  expected_metadata.column_names.emplace_back("col_another");
-  expected_metadata.column_names.emplace_back("col_list");
-  expected_metadata.column_names.emplace_back("col_multi_level_list");
+  // Struct column
+  auto ages_col = cudf::test::fixed_width_column_wrapper{
+    {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}};
 
-  auto expected = table_view({col0, col1, col2, col3, col4});
+  auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}};
 
+  // Struct/List mixed column
+
+  // []
+  // [NULL, 2, NULL]
+  // [4, 5]
+  // NULL
+  // []
+  // [7, 8, 9]
+  // [10]
+  // [11, 12]
+  lcw land{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}, {10}, {11, 12}}, valids2};
+
+  // []
+  // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]]
+  // [[7, 8], []]
+  // [[]]
+  // [[]]
+  // [[], [], []]
+  // [[10]]
+  // [[13, 14], [15]]
+  lcw flats{lcw{},
+            {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}},
+            {{7, 8}, {}},
+            lcw{lcw{}},
+            lcw{lcw{}},
+            lcw{lcw{}, lcw{}, lcw{}},
+            {lcw{10}},
+            {{13, 14}, {15}}};
+
+  auto struct_1 = cudf::test::structs_column_wrapper{land, flats};
+  auto is_human = cudf::test::fixed_width_column_wrapper{
+    {true, true, false, false, true, false, true, false}};
+  auto col6 = cudf::test::structs_column_wrapper{{is_human, struct_1}};
+
+  auto expected = table_view({col0, col1, col2, col3, col4, col5, col6});
+
+  // auto expected_slice = expected;
   auto expected_slice = cudf::slice(expected, {2, static_cast<cudf::size_type>(num_rows) - 1});
 
+  cudf_io::table_input_metadata expected_metadata(expected_slice);
+  expected_metadata.column_metadata[0].set_name("col_other");
+  expected_metadata.column_metadata[1].set_name("col_string");
+  expected_metadata.column_metadata[2].set_name("col_another");
+  expected_metadata.column_metadata[3].set_name("col_list");
+  expected_metadata.column_metadata[4].set_name("col_multi_level_list");
+  expected_metadata.column_metadata[5].set_name("col_struct");
+  expected_metadata.column_metadata[5].set_name("col_struct_list");
+  expected_metadata.column_metadata[6].child(0).set_name("human?");
+  expected_metadata.column_metadata[6].child(1).set_name("particulars");
+  expected_metadata.column_metadata[6].child(1).child(0).set_name("land");
+  expected_metadata.column_metadata[6].child(1).child(1).set_name("flats");
+
   auto filepath = temp_env->get_temp_filepath("SlicedTable.parquet");
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected_slice)
@@ -532,7 +598,7 @@ TEST_F(ParquetWriterTest, SlicedTable)
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, ListColumn)
@@ -607,18 +673,18 @@ TEST_F(ParquetWriterTest, ListColumn)
     },
     valids2};
 
-  cudf_io::table_metadata expected_metadata;
-  expected_metadata.column_names.emplace_back("col_list_int_0");
-  expected_metadata.column_names.emplace_back("col_list_list_int_1");
-  expected_metadata.column_names.emplace_back("col_list_list_int_nullable_2");
-  expected_metadata.column_names.emplace_back("col_list_list_nullable_double_nullable_3");
-  // expected_metadata.column_names.emplace_back("col_list_list_uint16_4");
-  expected_metadata.column_names.emplace_back("col_list_nullable_list_nullable_int_nullable_5");
-  expected_metadata.column_names.emplace_back("col_list_list_string_6");
-  expected_metadata.column_names.emplace_back("col_list_list_list_7");
-
   table_view expected({col0, col1, col2, col3, /* col4, */ col5, col6, col7});
 
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("col_list_int_0");
+  expected_metadata.column_metadata[1].set_name("col_list_list_int_1");
+  expected_metadata.column_metadata[2].set_name("col_list_list_int_nullable_2");
+  expected_metadata.column_metadata[3].set_name("col_list_list_nullable_double_nullable_3");
+  // expected_metadata.column_metadata[0].set_name("col_list_list_uint16_4");
+  expected_metadata.column_metadata[4].set_name("col_list_nullable_list_nullable_int_nullable_5");
+  expected_metadata.column_metadata[5].set_name("col_list_list_string_6");
+  expected_metadata.column_metadata[6].set_name("col_list_list_list_7");
+
   auto filepath = temp_env->get_temp_filepath("ListColumn.parquet");
   auto out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected)
                     .metadata(&expected_metadata)
@@ -630,7 +696,7 @@ TEST_F(ParquetWriterTest, ListColumn)
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, MultiIndex)
@@ -650,15 +716,6 @@ TEST_F(ParquetWriterTest, MultiIndex)
   column_wrapper col4{col4_data.begin(), col4_data.end(), validity};
   column_wrapper col5{col5_data.begin(), col5_data.end(), validity};
 
-  cudf_io::table_metadata expected_metadata;
-  expected_metadata.column_names.emplace_back("int8s");
-  expected_metadata.column_names.emplace_back("int16s");
-  expected_metadata.column_names.emplace_back("int32s");
-  expected_metadata.column_names.emplace_back("floats");
-  expected_metadata.column_names.emplace_back("doubles");
-  expected_metadata.user_data.insert(
-    {"pandas", "\"index_columns\": [\"floats\", \"doubles\"], \"column1\": [\"int8s\"]"});
-
   std::vector<std::unique_ptr<column>> cols;
   cols.push_back(col1.release());
   cols.push_back(col2.release());
@@ -668,6 +725,15 @@ TEST_F(ParquetWriterTest, MultiIndex)
   auto expected = std::make_unique<table>(std::move(cols));
   EXPECT_EQ(5, expected->num_columns());
 
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  expected_metadata.column_metadata[0].set_name("int8s");
+  expected_metadata.column_metadata[1].set_name("int16s");
+  expected_metadata.column_metadata[2].set_name("int32s");
+  expected_metadata.column_metadata[3].set_name("floats");
+  expected_metadata.column_metadata[4].set_name("doubles");
+  expected_metadata.user_data.insert(
+    {"pandas", "\"index_columns\": [\"floats\", \"doubles\"], \"column1\": [\"int8s\"]"});
+
   auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet");
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
@@ -681,7 +747,7 @@ TEST_F(ParquetWriterTest, MultiIndex)
   auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, HostBuffer)
@@ -692,14 +758,14 @@ TEST_F(ParquetWriterTest, HostBuffer)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
   column_wrapper col{seq_col.begin(), seq_col.end(), validity};
 
-  cudf_io::table_metadata expected_metadata;
-  expected_metadata.column_names.emplace_back("col_other");
-
   std::vector<std::unique_ptr<column>> cols;
   cols.push_back(col.release());
   const auto expected = std::make_unique<table>(std::move(cols));
   EXPECT_EQ(1, expected->num_columns());
 
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  expected_metadata.column_metadata[0].set_name("col_other");
+
   std::vector<char> out_buffer;
   cudf_io::parquet_writer_options out_opts =
     cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), expected->view())
@@ -710,7 +776,7 @@ TEST_F(ParquetWriterTest, HostBuffer)
   const auto result = cudf_io::read_parquet(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view());
-  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetWriterTest, NonNullable)
@@ -730,6 +796,175 @@ TEST_F(ParquetWriterTest, NonNullable)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected);
 }
 
+TEST_F(ParquetWriterTest, Struct)
+{
+  // Struct<is_human:bool, Struct<names:string, ages:int>>
+
+  auto names = {"Samuel Vimes",
+                "Carrot Ironfoundersson",
+                "Angua von Uberwald",
+                "Cheery Littlebottom",
+                "Detritus",
+                "Mr Slant"};
+
+  // `Name` column has all valid values.
+  auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()};
+
+  auto ages_col =
+    cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+
+  auto struct_1 = cudf::test::structs_column_wrapper{{names_col, ages_col}, {1, 1, 1, 1, 0, 1}};
+
+  auto is_human_col = cudf::test::fixed_width_column_wrapper{
+    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+
+  auto struct_2 =
+    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+
+  auto expected = table_view({*struct_2});
+
+  auto filepath = temp_env->get_temp_filepath("Struct.parquet");
+  cudf_io::parquet_writer_options args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected);
+  cudf_io::write_parquet(args);
+
+  cudf_io::parquet_reader_options read_args =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath));
+  cudf_io::read_parquet(read_args);
+}
+
+TEST_F(ParquetWriterTest, StructOfList)
+{
+  // Struct<is_human:bool,
+  //        Struct<weight:float,
+  //               ages:int,
+  //               land_unit:List<int>>,
+  //        flats:List<List<int>>
+  //       >
+  // >
+
+  auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
+
+  auto ages_col =
+    cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+
+  auto valids  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
+  auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; });
+
+  using lcw = cudf::test::lists_column_wrapper;
+
+  // []
+  // [NULL, 2, NULL]
+  // [4, 5]
+  // NULL
+  // []
+  // [7, 8, 9]
+  lcw land_unit{{{}, {{1, 2, 3}, valids}, {4, 5}, {}, {}, {7, 8, 9}}, valids2};
+
+  // []
+  // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]]
+  // [[7, 8], []]
+  // [[]]
+  // [[]]
+  // [[], [], []]
+  lcw flats{lcw{},
+            {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}},
+            {{7, 8}, {}},
+            lcw{lcw{}},
+            lcw{lcw{}},
+            lcw{lcw{}, lcw{}, lcw{}}};
+
+  auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col, land_unit, flats},
+                                                     {1, 1, 1, 1, 0, 1}};
+
+  auto is_human_col = cudf::test::fixed_width_column_wrapper{
+    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+
+  auto struct_2 =
+    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+
+  // cudf::test::print(struct_2->child(1).child(2));
+
+  auto expected = table_view({*struct_2});
+
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("being");
+  expected_metadata.column_metadata[0].child(0).set_name("human?");
+  expected_metadata.column_metadata[0].child(1).set_name("particulars");
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("weight");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("age");
+  expected_metadata.column_metadata[0].child(1).child(2).set_name("land_unit");
+  expected_metadata.column_metadata[0].child(1).child(3).set_name("flats");
+
+  auto filepath = temp_env->get_temp_filepath("StructOfList.parquet");
+  cudf_io::parquet_writer_options args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected)
+      .metadata(&expected_metadata);
+  cudf_io::write_parquet(args);
+
+  cudf_io::parquet_reader_options read_args =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath));
+  const auto result = cudf_io::read_parquet(read_args);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+  compare_metadata_equality(expected_metadata, result.metadata);
+}
+
+TEST_F(ParquetWriterTest, ListOfStruct)
+{
+  // List<Struct<is_human:bool,
+  //             Struct<weight:float,
+  //                    age:int
+  //                   >
+  //            >
+  //     >
+
+  auto weight_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
+
+  auto ages_col =
+    cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+
+  auto struct_1 = cudf::test::structs_column_wrapper{{weight_col, ages_col}, {1, 1, 1, 1, 0, 1}};
+
+  auto is_human_col = cudf::test::fixed_width_column_wrapper{
+    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+
+  auto struct_2 =
+    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+
+  auto list_offsets_column =
+    cudf::test::fixed_width_column_wrapper{0, 2, 5, 5, 6}.release();
+  auto num_list_rows = list_offsets_column->size() - 1;
+
+  auto list_col = cudf::make_lists_column(num_list_rows,
+                                          std::move(list_offsets_column),
+                                          std::move(struct_2),
+                                          cudf::UNKNOWN_NULL_COUNT,
+                                          {});
+
+  auto expected = table_view({*list_col});
+
+  cudf_io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("family");
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("human?");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars");
+  expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight");
+  expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age");
+
+  auto filepath = temp_env->get_temp_filepath("ListOfStruct.parquet");
+  cudf_io::parquet_writer_options args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected)
+      .metadata(&expected_metadata);
+  cudf_io::write_parquet(args);
+
+  cudf_io::parquet_reader_options read_args =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath));
+  const auto result = cudf_io::read_parquet(read_args);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+  compare_metadata_equality(expected_metadata, result.metadata);
+}
+
 // custom data sink that supports device writes. uses plain file io.
 class custom_test_data_sink : public cudf::io::data_sink {
  public:
@@ -1055,6 +1290,168 @@ TEST_F(ParquetChunkedWriterTest, ListColumn)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected);
 }
 
+TEST_F(ParquetChunkedWriterTest, ListOfStruct)
+{
+  // Table 1
+  auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}};
+  auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}};
+  auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1};
+  auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}};
+  auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}};
+
+  auto list_offsets_column_1 =
+    cudf::test::fixed_width_column_wrapper{0, 2, 3, 3}.release();
+  auto num_list_rows_1 = list_offsets_column_1->size() - 1;
+
+  auto list_col_1 = cudf::make_lists_column(num_list_rows_1,
+                                            std::move(list_offsets_column_1),
+                                            struct_2_1.release(),
+                                            cudf::UNKNOWN_NULL_COUNT,
+                                            {});
+
+  auto table_1 = table_view({*list_col_1});
+
+  // Table 2
+  auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}};
+  auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}};
+  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}};
+  auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}};
+  auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
+
+  auto list_offsets_column_2 =
+    cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}.release();
+  auto num_list_rows_2 = list_offsets_column_2->size() - 1;
+
+  auto list_col_2 = cudf::make_lists_column(num_list_rows_2,
+                                            std::move(list_offsets_column_2),
+                                            struct_2_2.release(),
+                                            cudf::UNKNOWN_NULL_COUNT,
+                                            {});
+
+  auto table_2 = table_view({*list_col_2});
+
+  auto full_table = cudf::concatenate({table_1, table_2});
+
+  cudf_io::table_input_metadata expected_metadata(table_1);
+  expected_metadata.column_metadata[0].set_name("family");
+  expected_metadata.column_metadata[0].child(1).set_nullability(false);
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("human?");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars");
+  expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight");
+  expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age");
+
+  auto filepath = temp_env->get_temp_filepath("ChunkedListOfStruct.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  args.set_metadata(&expected_metadata);
+  cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2);
+
+  cudf_io::parquet_reader_options read_opts =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+  auto result = cudf_io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table);
+  compare_metadata_equality(expected_metadata, result.metadata);
+}
+
+TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList)
+{
+  auto valids  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
+  auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; });
+
+  using lcw = cudf::test::lists_column_wrapper;
+
+  // Table 1 ===========================
+
+  // []
+  // [NULL, 2, NULL]
+  // [4, 5]
+  // NULL
+  lcw land_1{{{}, {{1, 2, 3}, valids}, {4, 5}, {}}, valids2};
+
+  // []
+  // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]]
+  // [[7, 8], []]
+  // [[]]
+  lcw flats_1{lcw{}, {{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}, {}}, lcw{lcw{}}};
+
+  auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3, 1.1}};
+  auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5, 31}};
+  auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1, land_1, flats_1};
+  auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false, false}};
+  auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}};
+
+  auto list_offsets_column_1 =
+    cudf::test::fixed_width_column_wrapper{0, 2, 3, 4}.release();
+  auto num_list_rows_1 = list_offsets_column_1->size() - 1;
+
+  auto list_col_1 = cudf::make_lists_column(num_list_rows_1,
+                                            std::move(list_offsets_column_1),
+                                            struct_2_1.release(),
+                                            cudf::UNKNOWN_NULL_COUNT,
+                                            {});
+
+  auto table_1 = table_view({*list_col_1});
+
+  // Table 2 ===========================
+
+  // []
+  // [7, 8, 9]
+  lcw land_2{{}, {7, 8, 9}};
+
+  // [[]]
+  // [[], [], []]
+  lcw flats_2{lcw{lcw{}}, lcw{lcw{}, lcw{}, lcw{}}};
+
+  auto weight_2 = cudf::test::fixed_width_column_wrapper{{-1.0, -1.0}};
+  auto ages_2 = cudf::test::fixed_width_column_wrapper{{351, 351}, {1, 0}};
+  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2, land_2, flats_2}, {0, 1}};
+  auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false}, {1, 0}};
+  auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
+
+  auto list_offsets_column_2 =
+    cudf::test::fixed_width_column_wrapper{0, 1, 2}.release();
+  auto num_list_rows_2 = list_offsets_column_2->size() - 1;
+
+  auto list_col_2 = cudf::make_lists_column(num_list_rows_2,
+                                            std::move(list_offsets_column_2),
+                                            struct_2_2.release(),
+                                            cudf::UNKNOWN_NULL_COUNT,
+                                            {});
+
+  auto table_2 = table_view({*list_col_2});
+
+  auto full_table = cudf::concatenate({table_1, table_2});
+
+  cudf_io::table_input_metadata expected_metadata(table_1);
+  expected_metadata.column_metadata[0].set_name("family");
+  expected_metadata.column_metadata[0].child(1).set_nullability(false);
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("human?");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("particulars");
+  expected_metadata.column_metadata[0].child(1).child(1).child(0).set_name("weight");
+  expected_metadata.column_metadata[0].child(1).child(1).child(1).set_name("age");
+  expected_metadata.column_metadata[0].child(1).child(1).child(2).set_name("land_unit");
+  expected_metadata.column_metadata[0].child(1).child(1).child(3).set_name("flats");
+
+  auto filepath = temp_env->get_temp_filepath("ListOfStructOfStructOfListOfList.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  args.set_metadata(&expected_metadata);
+  cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2);
+
+  cudf_io::parquet_reader_options read_opts =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+  auto result = cudf_io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table);
+  compare_metadata_equality(expected_metadata, result.metadata);
+
+  // We specifically mentioned in input schema that struct_2 is non-nullable across chunked calls.
+  auto result_parent_list = result.tbl->get_column(0);
+  auto result_struct_2 = result_parent_list.child(cudf::lists_column_view::child_column_index);
+  EXPECT_EQ(result_struct_2.nullable(), false);
+}
+
 TEST_F(ParquetChunkedWriterTest, MismatchedTypes)
 {
   srand(31337);
@@ -1150,8 +1547,7 @@ TEST_F(ParquetChunkedWriterTest, MismatchedStructureList)
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
   cudf_io::parquet_chunked_writer writer(args);
   writer.write(tbl0);
-  CUDF_EXPECT_THROW_MESSAGE(writer.write(tbl1),
-                            "Mismatch in schema between multiple calls to write_chunk");
+  EXPECT_THROW(writer.write(tbl1), cudf::logic_error);
 }
 
 TEST_F(ParquetChunkedWriterTest, DifferentNullability)
@@ -1174,6 +1570,54 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullability)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table);
 }
 
+TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct)
+{
+  // Struct<is_human:bool,
+  //        Struct<weight:float,
+  //               age:int
+  //              > (nullable)
+  //       > (non-nullable)
+
+  // Table 1: is_human and struct_1 are non-nullable but should be nullable when read back.
+  auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}};
+  auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}};
+  auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1};
+  auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}};
+  auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}};
+  auto table_1 = cudf::table_view({struct_2_1});
+
+  // Table 2: struct_1 and is_human are nullable now so if we hadn't assumed worst case (nullable)
+  // when writing table_1, we would have wrong pages for it.
+  auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}};
+  auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}};
+  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}};
+  auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}, {1, 1, 0}};
+  auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
+  auto table_2 = cudf::table_view({struct_2_2});
+
+  auto full_table = cudf::concatenate({table_1, table_2});
+
+  cudf_io::table_input_metadata expected_metadata(table_1);
+  expected_metadata.column_metadata[0].set_name("being");
+  expected_metadata.column_metadata[0].child(0).set_name("human?");
+  expected_metadata.column_metadata[0].child(1).set_name("particulars");
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("weight");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("age");
+
+  auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  args.set_metadata(&expected_metadata);
+  cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2);
+
+  cudf_io::parquet_reader_options read_opts =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+  auto result = cudf_io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table);
+  compare_metadata_equality(expected_metadata, result.metadata);
+}
+
 TEST_F(ParquetChunkedWriterTest, ForcedNullability)
 {
   srand(31337);
@@ -1184,17 +1628,17 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullability)
 
   auto filepath = temp_env->get_temp_filepath("ChunkedNoNullable.parquet");
 
-  cudf::io::table_metadata_with_nullability nullable_metadata;
+  cudf_io::table_input_metadata metadata(*table1);
 
   // In the absence of prescribed per-column nullability in metadata, the writer assumes the worst
   // and considers all columns nullable. However cudf::concatenate will not force nulls in case no
   // columns are nullable. To get the expected result, we tell the writer the nullability of all
   // columns in advance.
-  nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 5, false);
+  for (auto& col_meta : metadata.column_metadata) { col_meta.set_nullability(false); }
 
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
-      .nullable_metadata(&nullable_metadata);
+      .metadata(&metadata);
   cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
 
   cudf_io::parquet_reader_options read_opts =
@@ -1213,8 +1657,6 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList)
 
   using lcw = cudf::test::lists_column_wrapper;
 
-  cudf::io::table_metadata_with_nullability nullable_metadata;
-
   // COL0 ====================
   // [1, 2, 3]
   // []
@@ -1228,9 +1670,6 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList)
   // NULL
   lcw col01{{{7}, {}, {8, 9, 10, 11}, {}}, valids2};
 
-  nullable_metadata.column_nullable.push_back(true);   // List is nullable at first (root) level
-  nullable_metadata.column_nullable.push_back(false);  // non-nullable at second (leaf) level
-
   // COL1 (non-nested columns to test proper schema construction)
   size_t num_rows = static_cast(col00).size();
   auto seq_col0 = random_values(num_rows);
@@ -1239,18 +1678,22 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList)
   column_wrapper col10{seq_col0.begin(), seq_col0.end(), valids};
   column_wrapper col11{seq_col1.begin(), seq_col1.end(), valids2};
 
-  nullable_metadata.column_nullable.push_back(true);
-
   auto table1 = table_view({col00, col10});
   auto table2 = table_view({col01, col11});
 
   auto full_table = cudf::concatenate({table1, table2});
 
+  cudf_io::table_input_metadata metadata(table1);
+  metadata.column_metadata[0].set_nullability(true);  // List is nullable at first (root) level
+  metadata.column_metadata[0].child(1).set_nullability(
+    false);  // non-nullable at second (leaf) level
+  metadata.column_metadata[1].set_nullability(true);
+
   auto filepath = temp_env->get_temp_filepath("ChunkedListNullable.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
-      .nullable_metadata(&nullable_metadata);
+      .metadata(&metadata);
   cudf_io::parquet_chunked_writer(args).write(table1).write(table2);
 
   cudf_io::parquet_reader_options read_opts =
@@ -1260,30 +1703,50 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table);
 }
 
-TEST_F(ParquetChunkedWriterTest, WrongNullability)
+TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct)
 {
-  srand(31337);
-  auto table1 = create_random_fixed_table(5, 5, false);
+  // Struct<is_human:bool,
+  //        Struct<weight:float,
+  //               age:int
+  //              > (nullable)
+  //       > (non-nullable)
+
+  // Table 1: is_human and struct_2 are non-nullable and should stay that way when read back.
+  auto weight_1 = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}};
+  auto ages_1 = cudf::test::fixed_width_column_wrapper{{30, 27, 5}};
+  auto struct_1_1 = cudf::test::structs_column_wrapper{weight_1, ages_1};
+  auto is_human_1 = cudf::test::fixed_width_column_wrapper{{true, true, false}};
+  auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}};
+  auto table_1 = cudf::table_view({struct_2_1});
+
+  auto weight_2 = cudf::test::fixed_width_column_wrapper{{1.1, -1.0, -1.0}};
+  auto ages_2 = cudf::test::fixed_width_column_wrapper{{31, 351, 351}, {1, 1, 0}};
+  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}};
+  auto is_human_2 = cudf::test::fixed_width_column_wrapper{{false, false, false}};
+  auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
+  auto table_2 = cudf::table_view({struct_2_2});
+
+  auto full_table = cudf::concatenate({table_1, table_2});
+
+  cudf_io::table_input_metadata expected_metadata(table_1);
+  expected_metadata.column_metadata[0].set_name("being").set_nullability(false);
+  expected_metadata.column_metadata[0].child(0).set_name("human?").set_nullability(false);
+  expected_metadata.column_metadata[0].child(1).set_name("particulars");
+  expected_metadata.column_metadata[0].child(1).child(0).set_name("weight");
+  expected_metadata.column_metadata[0].child(1).child(1).set_name("age");
+
+  auto filepath = temp_env->get_temp_filepath("ChunkedNullableStruct.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  args.set_metadata(&expected_metadata);
+  cudf_io::parquet_chunked_writer(args).write(table_1).write(table_2);
 
-  auto filepath = temp_env->get_temp_filepath("ChunkedWrongNullable.parquet");
+  cudf_io::parquet_reader_options read_opts =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+  auto result = cudf_io::read_parquet(read_opts);
 
-  cudf::io::table_metadata_with_nullability nullable_metadata;
-  // Number of columns with mask in table (i.e 5) and size of column nullability (i.e 6), are
-  // mismatching.
-  nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 6, true);
-  cudf_io::chunked_parquet_writer_options args =
-    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
-      .nullable_metadata(&nullable_metadata);
-  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(*table1), cudf::logic_error);
-
-  nullable_metadata.column_nullable.clear();
-  // Number of columns with mask in table (i.e 5) and size of column nullability (i.e 4), are
-  // mismatching.
-  nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 4, true);
-  cudf_io::chunked_parquet_writer_options args2 =
-    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
-      .nullable_metadata(&nullable_metadata);
-  EXPECT_THROW(cudf_io::parquet_chunked_writer(args2).write(*table1), cudf::logic_error);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table);
+  compare_metadata_equality(expected_metadata, result.metadata);
 }
 
 TEST_F(ParquetChunkedWriterTest, ReadRowGroups)
@@ -1328,7 +1791,7 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError)
   EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error);
 }
 
-TEST_F(ParquetChunkedWriterTest, DecimalWrite)
+TEST_F(ParquetWriterTest, DecimalWrite)
 {
   constexpr cudf::size_type num_rows = 500;
   auto seq_col0 = random_values(num_rows);
@@ -1345,36 +1808,25 @@ TEST_F(ParquetChunkedWriterTest, DecimalWrite)
   auto table = table_view({col0, col1});
 
   auto filepath = temp_env->get_temp_filepath("DecimalWrite.parquet");
-  cudf_io::chunked_parquet_writer_options args =
-    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  cudf_io::parquet_writer_options args =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, table);
 
   // verify failure if no decimal precision given
-  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
+  EXPECT_THROW(cudf_io::write_parquet(args), cudf::logic_error);
+
+  cudf_io::table_input_metadata expected_metadata(table);
 
   // verify failure if too small a precision is given
-  std::vector precisions{7, 1};
-  args.set_decimal_precision_data(precisions);
-  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
-
-  // verify failure if too few precisions given
-  precisions.pop_back();
-  args.set_decimal_precision_data(precisions);
-  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
-
-  // verify sucess if equal precision is given
-  precisions = {7, 9};
-  args.set_decimal_precision_data(precisions);
-  cudf_io::parquet_chunked_writer(args).write(table);
-
-  // verify failure if too many precisions given
-  precisions = {7, 14, 11};
-  args.set_decimal_precision_data(precisions);
-  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
-
-  // write correctly
-  precisions.pop_back();
-  args.set_decimal_precision_data(precisions);
-  cudf_io::parquet_chunked_writer(args).write(table);
+  expected_metadata.column_metadata[0].set_decimal_precision(7);
+  expected_metadata.column_metadata[1].set_decimal_precision(1);
+  args.set_metadata(&expected_metadata);
+  EXPECT_THROW(cudf_io::write_parquet(args), cudf::logic_error);
+
+  // verify success if equal precision is given
+  expected_metadata.column_metadata[0].set_decimal_precision(7);
+  expected_metadata.column_metadata[1].set_decimal_precision(9);
+  args.set_metadata(&expected_metadata);
+  cudf_io::write_parquet(args);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1744,9 +2196,9 @@ TEST_F(ParquetReaderTest, ReorderedColumns)
     cudf::table_view tbl{{a, b}};
     auto filepath = temp_env->get_temp_filepath("ReorderedColumns.parquet");
-    cudf_io::table_metadata md;
-    md.column_names.push_back("a");
-    md.column_names.push_back("b");
+    cudf_io::table_input_metadata md(tbl);
+    md.column_metadata[0].set_name("a");
+    md.column_metadata[1].set_name("b");
     cudf_io::parquet_writer_options opts =
cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl).metadata(&md); cudf_io::write_parquet(opts); @@ -1766,9 +2218,9 @@ TEST_F(ParquetReaderTest, ReorderedColumns) cudf::table_view tbl{{a, b}}; auto filepath = temp_env->get_temp_filepath("ReorderedColumns2.parquet"); - cudf_io::table_metadata md; - md.column_names.push_back("a"); - md.column_names.push_back("b"); + cudf_io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); cudf_io::parquet_writer_options opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl).metadata(&md); cudf_io::write_parquet(opts); @@ -1791,11 +2243,11 @@ TEST_F(ParquetReaderTest, ReorderedColumns) cudf::table_view tbl{{a, b, c, d}}; auto filepath = temp_env->get_temp_filepath("ReorderedColumns3.parquet"); - cudf_io::table_metadata md; - md.column_names.push_back("a"); - md.column_names.push_back("b"); - md.column_names.push_back("c"); - md.column_names.push_back("d"); + cudf_io::table_input_metadata md(tbl); + md.column_metadata[0].set_name("a"); + md.column_metadata[1].set_name("b"); + md.column_metadata[2].set_name("c"); + md.column_metadata[3].set_name("d"); cudf_io::parquet_writer_options opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, tbl).metadata(&md); cudf_io::write_parquet(opts); @@ -2205,4 +2657,5 @@ TEST_F(ParquetReaderTest, DecimalRead) EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error); } } + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index f7f094834e6..519565fa48c 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -3,6 +3,7 @@ from libcpp cimport bool from libcpp.string cimport string from libcpp.vector cimport vector +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libc.stdint cimport uint8_t @@ -64,17 +65,35 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_parquet( parquet_reader_options args) except + + cdef cppclass column_in_metadata: + column_in_metadata& set_name(const string& name) + column_in_metadata& set_nullability(bool nullable) + column_in_metadata& set_list_column_as_map() + column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& child(size_type i) + + cdef cppclass table_input_metadata: + table_input_metadata() except + + table_input_metadata(const cudf_table_view.table_view& table) except + + table_input_metadata( + const cudf_table_view.table_view& table, + map[string, string] user_data + ) except + + + vector[column_in_metadata] column_metadata + map[string, string] user_data + cdef cppclass parquet_writer_options: parquet_writer_options() except + cudf_io_types.sink_info get_sink_info() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view get_table() except + - const cudf_io_types.table_metadata get_metadata() except + + const table_input_metadata get_metadata() except + string get_column_chunks_file_path() except+ void set_metadata( - cudf_io_types.table_metadata *m + table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -100,7 +119,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view table_ ) except + parquet_writer_options_builder& metadata( - 
cudf_io_types.table_metadata *m + table_input_metadata *m ) except + parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf @@ -126,11 +145,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.sink_info get_sink() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + - cudf_io_types.table_metadata_with_nullability* get_nullable_metadata( + table_input_metadata* get_metadata( ) except+ - void set_nullable_metadata( - cudf_io_types.table_metadata_with_nullability *m + void set_metadata( + table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -149,8 +168,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder( cudf_io_types.sink_info sink_, ) except + - chunked_parquet_writer_options_builder& nullable_metadata( - cudf_io_types.table_metadata_with_nullability *m + chunked_parquet_writer_options_builder& metadata( + table_input_metadata *m ) except + chunked_parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a9739a02283..87179c02fe2 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -48,6 +48,8 @@ from cudf._lib.cpp.table.table_view cimport ( from cudf._lib.cpp.io.parquet cimport ( read_parquet as parquet_reader, parquet_reader_options, + table_input_metadata, + column_in_metadata, parquet_writer_options, write_parquet as parquet_writer, parquet_chunked_writer as cpp_parquet_chunked_writer, @@ -284,10 +286,8 @@ cpdef write_parquet( """ # Create the write options - cdef unique_ptr[cudf_io_types.table_metadata] tbl_meta = \ - make_unique[cudf_io_types.table_metadata]() + cdef unique_ptr[table_input_metadata] tbl_meta - cdef vector[string] column_names cdef map[string, string] user_data cdef table_view tv cdef unique_ptr[cudf_io_types.data_sink] _data_sink @@ -295,23 +295,29 @@ cpdef write_parquet( if index is not False and not isinstance(table._index, cudf.RangeIndex): tv = table.view() + tbl_meta = make_unique[table_input_metadata](tv) for level, idx_name in enumerate(table._index.names): - column_names.push_back( + tbl_meta.get().column_metadata[level].set_name( str.encode( _index_level_name(idx_name, level, table._column_names) ) ) + num_index_cols_meta = len(table._index.names) else: tv = table.data_view() + tbl_meta = make_unique[table_input_metadata](tv) + num_index_cols_meta = 0 - for col_name in table._column_names: - column_names.push_back(str.encode(col_name)) + for i, name in enumerate(table._column_names, num_index_cols_meta): + tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, tbl_meta.get().column_metadata[i] + ) pandas_metadata = generate_pandas_metadata(table, index) user_data[str.encode("pandas")] = str.encode(pandas_metadata) # Set the table_metadata - tbl_meta.get().column_names = column_names tbl_meta.get().user_data = user_data cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) @@ -357,6 +363,7 @@ cdef class ParquetWriter: """ cdef bool initialized cdef unique_ptr[cpp_parquet_chunked_writer] writer + cdef unique_ptr[table_input_metadata] tbl_meta cdef cudf_io_types.sink_info sink cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.statistics_freq stat_freq @@ -416,20 +423,44 @@ cdef class ParquetWriter: def 
_initialize_chunked_state(self, Table table): """ Prepares all the values required to build the chunked_parquet_writer_options and creates a writer""" - cdef unique_ptr[cudf_io_types.table_metadata_with_nullability] tbl_meta - tbl_meta = make_unique[cudf_io_types.table_metadata_with_nullability]() + cdef table_view tv # Set the table_metadata - tbl_meta.get().column_names = get_column_names(table, self.index) + num_index_cols_meta = 0 + self.tbl_meta = make_unique[table_input_metadata](table.data_view()) + if self.index is not False: + if isinstance(table._index, cudf.core.multiindex.MultiIndex): + tv = table.view() + self.tbl_meta = make_unique[table_input_metadata](tv) + for level, idx_name in enumerate(table._index.names): + self.tbl_meta.get().column_metadata[level].set_name( + (str.encode(idx_name)) + ) + num_index_cols_meta = len(table._index.names) + else: + if table._index.name is not None: + tv = table.view() + self.tbl_meta = make_unique[table_input_metadata](tv) + self.tbl_meta.get().column_metadata[0].set_name( + str.encode(table._index.name) + ) + num_index_cols_meta = 1 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, self.tbl_meta.get().column_metadata[i] + ) + pandas_metadata = generate_pandas_metadata(table, self.index) - tbl_meta.get().user_data[str.encode("pandas")] = \ + self.tbl_meta.get().user_data[str.encode("pandas")] = \ str.encode(pandas_metadata) cdef chunked_parquet_writer_options args with nogil: args = move( chunked_parquet_writer_options.builder(self.sink) - .nullable_metadata(tbl_meta.get()) + .metadata(self.tbl_meta.get()) .compression(self.comp_type) .stats_level(self.stat_freq) .build() @@ -514,3 +545,15 @@ cdef Column _update_column_struct_field_names( ) col.set_base_children(tuple(children)) return col + +cdef _set_col_children_names(Column col, column_in_metadata& col_meta): + if is_struct_dtype(col): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name.encode()) + _set_col_children_names(child_col, col_meta.child(i)) + elif is_list_dtype(col): + _set_col_children_names(col.children[1], col_meta.child(1)) + else: + return diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4c4ef17c6b9..6698a47b416 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -22,6 +22,7 @@ from cudf.utils.dtypes import ( np_to_pa_dtype, is_categorical_dtype, is_list_dtype, + is_struct_dtype, ) @@ -79,7 +80,7 @@ cpdef generate_pandas_metadata(Table table, index): "'category' column dtypes are currently not " + "supported by the gpu accelerated parquet writer" ) - elif is_list_dtype(col): + elif is_list_dtype(col) or is_struct_dtype(col): types.append(col.dtype.to_arrow()) else: types.append(np_to_pa_dtype(col.dtype)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4fb5762e098..812a20cba45 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7318,15 +7318,6 @@ def to_parquet(self, path, *args, **kwargs): """{docstring}""" from cudf.io import parquet as pq - if any( - isinstance(col, cudf.core.column.StructColumn) - for col in self._data.columns - ): - raise NotImplementedError( - "Writing to parquet format is not yet supported " - "with Struct columns." 
- ) - return pq.to_parquet(self, path, *args, **kwargs) @ioutils.doc_to_feather() diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index dc4d0615a7f..6d50e4b6fee 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1834,3 +1834,89 @@ def test_parquet_writer_list_statistics(tmpdir): actual_max = cudf.Series(pd_slice[col].explode().explode()).max() stats_max = stats.max assert normalized_equals(actual_max, stats_max) + + +@pytest.mark.parametrize( + "data", + [ + # Structs + { + "being": [ + None, + {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}}, + {"human?": None, "Deets": {"Name": "Angua", "Age": 25}}, + {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}}, + {"human?": False, "Deets": None}, + {"human?": None, "Deets": {"Name": "Mr", "Age": None}}, + ] + }, + # List of Structs + pytest.param( + { + "family": [ + [ + None, + {"human?": True, "deets": {"weight": 2.4, "age": 27}}, + ], + [ + {"human?": None, "deets": {"weight": 5.3, "age": 25}}, + {"human?": False, "deets": {"weight": 8.0, "age": 31}}, + {"human?": False, "deets": None}, + ], + [], + [{"human?": None, "deets": {"weight": 6.9, "age": None}}], + ] + }, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/7561" + ), + ), + # Struct of Lists + pytest.param( + { + "Real estate records": [ + None, + { + "Status": "NRI", + "Ownerships": { + "land_unit": [None, 2, None], + "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], + }, + }, + { + "Status": None, + "Ownerships": { + "land_unit": [4, 5], + "flats": [[7, 8], []], + }, + }, + { + "Status": "RI", + "Ownerships": {"land_unit": None, "flats": [[]]}, + }, + {"Status": "RI", "Ownerships": None}, + { + "Status": None, + "Ownerships": { + "land_unit": [7, 8, 9], + "flats": [[], [], []], + }, + }, + ] + }, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/7562" + ), + ), + ], +) +def test_parquet_writer_nested(tmpdir, data): + expect = pd.DataFrame(data) + gdf = cudf.from_pandas(expect) + + fname = tmpdir.join("test_parquet_writer_nested.parquet") + gdf.to_parquet(fname) + assert os.path.exists(fname) + + got = pd.read_parquet(fname) + assert_eq(expect, got) From 8773a40f4c8ce63f56ed6eb67b4eaf959106939f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 19 Mar 2021 09:46:00 -0500 Subject: [PATCH 18/26] Add in JNI support for count_elements (#7651) This is just a simple JNI wrapper around cudf::lists::count_elements Authors: - Robert (Bobby) Evans (@revans2) Approvers: - Jason Lowe (@jlowe) URL: https://github.com/rapidsai/cudf/pull/7651 --- .../main/java/ai/rapids/cudf/ColumnView.java | 11 ++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 20 ++++++++++++++++--- .../java/ai/rapids/cudf/ColumnVectorTest.java | 12 +++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 2f3f2bf80cf..e50a9e86ead 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -256,6 +256,15 @@ public final ColumnVector getByteCount() { return new ColumnVector(byteCount(getNativeView())); } + /** + * Get the number of elements for each list. Null lists will have a value of null. + * @return the number of elements in each list as an INT32 value. 
+ */ + public final ColumnVector countElements() { + assert DType.LIST.equals(type) : "Only lists are supported"; + return new ColumnVector(countElements(getNativeView())); + } + /** * Returns a Boolean vector with the same number of rows as this instance, that has * TRUE for any entry that is not null, and FALSE for any null entry (as per the validity mask) @@ -2749,6 +2758,8 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long binaryOpVV(long lhs, long rhs, int op, int dtype, int scale); + private static native long countElements(long viewHandle); + private static native long byteCount(long viewHandle) throws CudfException; private static native long extractListElement(long nativeView, int index); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index ac14e1605d7..73db5ee4df3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -20,16 +20,17 @@ #include #include #include -#include -#include -#include #include #include #include +#include +#include +#include #include #include #include #include +#include #include #include #include @@ -430,6 +431,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, j CATCH_STD(env, NULL); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_countElements(JNIEnv *env, jclass clazz, + jlong view_handle) { + JNI_NULL_CHECK(env, view_handle, "input column is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *n_column = reinterpret_cast(view_handle); + std::unique_ptr result = + cudf::lists::count_elements(cudf::lists_column_view(*n_column)); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_charLengths(JNIEnv *env, jclass clazz, jlong view_handle) { JNI_NULL_CHECK(env, view_handle, "input column is null", 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d224543e574..420e176efe2 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1666,6 +1666,18 @@ void testAppendStrings() { } } + @Test + void testCountElements() { + DataType dt = new ListType(true, new BasicType(true, DType.INT32)); + try (ColumnVector cv = ColumnVector.fromLists(dt, Arrays.asList(1), + Arrays.asList(1, 2), null, Arrays.asList(null, null), + Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 3, 4)); + ColumnVector lengths = cv.countElements(); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 2, 3, 4)) { + TableTest.assertColumnsAreEqual(expected, lengths); + } + } + @Test void testStringLengths() { try (ColumnVector cv = ColumnVector.fromStrings("1", "12", null, "123", "1234"); From 8687182caa77fa3740d7f474815f90fd5c0627a4 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 19 Mar 2021 13:02:03 -0500 Subject: [PATCH 19/26] Fix Java Parquet write after writer API changes (#7655) This fixes the java build, but it required breaking changes to do it. I'll put up a corresponding change in the rapids plugin shortly. https://github.com/rapidsai/cudf/issues/7654 was found as a part of this. This is also not the final API that we will want. We need to redo how we configure the builders so that they can take advantage of the new APIs properly. 
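Roughly, the pattern the JNI layer now has to follow on the C++ side looks like the sketch below (illustrative only; the helper name and parameter list are invented, but the `cudf::io` calls mirror the `TableJni.cpp` changes in this patch):

```cpp
// Illustrative sketch: per-column settings (name, nullability, INT96
// timestamps, decimal precision) now live on table_input_metadata entries
// instead of the removed nullable_metadata()/int96_timestamps()/
// decimal_precision() builder calls.
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

#include <string>
#include <vector>

// Assumes names/nullable each have one entry per top-level column of tbl.
void write_chunked(cudf::io::sink_info const& sink,
                   cudf::table_view const& tbl,
                   std::vector<std::string> const& names,
                   std::vector<bool> const& nullable,
                   std::vector<int> const& precisions,  // non-decimal entries ignored
                   bool use_int96)
{
  cudf::io::table_input_metadata metadata(tbl);
  for (std::size_t i = 0; i < names.size(); ++i) {
    metadata.column_metadata[i]
      .set_name(names[i])
      .set_nullability(nullable[i])
      .set_int96_timestamps(use_int96);
    // Precision is optional and only meaningful for decimal columns.
    if (i < precisions.size()) {
      metadata.column_metadata[i].set_decimal_precision(precisions[i]);
    }
  }
  auto opts = cudf::io::chunked_parquet_writer_options::builder(sink)
                .metadata(&metadata)
                .build();
  cudf::io::parquet_chunked_writer writer(opts);
  writer.write(tbl);
  writer.close();
}
```

Note that the options only hold a pointer to the metadata, so it has to stay alive until the writer is done with it.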
Authors: - Robert (Bobby) Evans (@revans2) Approvers: - Jason Lowe (@jlowe) - Raza Jafri (@razajafri) URL: https://github.com/rapidsai/cudf/pull/7655 --- .../ai/rapids/cudf/ParquetWriterOptions.java | 27 ++------- java/src/main/java/ai/rapids/cudf/Table.java | 17 ++---- .../java/ai/rapids/cudf/WriterOptions.java | 4 +- java/src/main/native/src/TableJni.cpp | 60 ++++++++++++------- .../test/java/ai/rapids/cudf/TableTest.java | 29 ++++----- 5 files changed, 63 insertions(+), 74 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 1203fc25931..2e793494b7b 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -58,25 +58,12 @@ public Builder withTimestampInt96(boolean int96) { } /** - * Overwrite flattened precision values for all decimal columns that are expected to be in - * this Table. The list of precisions should be an in-order traversal of all Decimal columns, - * including nested columns. Please look at the example below. - * - * NOTE: The number of `precisionValues` should be equal to the numbers of Decimal columns - * otherwise a CudfException will be thrown. Also note that the values will be overwritten - * every time this method is called - * - * Example: - * Table0 : c0[type: INT32] - * c1[type: Decimal32(3, 1)] - * c2[type: Struct[col0[type: Decimal(2, 1)], - * col1[type: INT64], - * col2[type: Decimal(8, 6)]] - * c3[type: Decimal64(12, 5)] - * - * Flattened list of precision from the above example will be {3, 2, 8, 12} + * This is a temporary hack to make things work. This API will go away once we can update the + * parquet APIs properly. + * @param precisionValues a value for each column, non-decimal columns are ignored. + * @return this for chaining. */ - public Builder withPrecisionValues(int... precisionValues) { + public Builder withDecimalPrecisions(int ... precisionValues) { this.precisionValues = precisionValues; return this; } @@ -86,8 +73,6 @@ public ParquetWriterOptions build() { } } - public static final ParquetWriterOptions DEFAULT = new ParquetWriterOptions(new Builder()); - public static Builder builder() { return new Builder(); } @@ -107,7 +92,7 @@ public StatisticsFrequency getStatisticsFrequency() { /** * Return the flattened list of precisions if set otherwise empty array will be returned. 
- * For a definition of what `flattened` means please look at {@link Builder#withPrecisionValues} + * For a definition of what `flattened` means please look at {@link Builder#withDecimalPrecisions} */ public int[] getPrecisions() { return precisions; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 0dc529d423f..4da99d811f2 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -803,6 +803,12 @@ private static class ParquetTableWriter implements TableWriter { HostBufferConsumer consumer; private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { + int numColumns = options.getColumnNames().length; + assert (numColumns == options.getColumnNullability().length); + int[] precisions = options.getPrecisions(); + if (precisions != null) { + assert (numColumns >= options.getPrecisions().length); + } this.consumer = null; this.handle = writeParquetFileBegin(options.getColumnNames(), options.getColumnNullability(), @@ -871,17 +877,6 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options, return new ParquetTableWriter(options, consumer); } - /** - * Writes this table to a Parquet file on the host - * - * @param outputFile file to write the table to - * @deprecated please use writeParquetChunked instead - */ - @Deprecated - public void writeParquet(File outputFile) { - writeParquet(ParquetWriterOptions.DEFAULT, outputFile); - } - /** * Writes this table to a Parquet file on the host * diff --git a/java/src/main/java/ai/rapids/cudf/WriterOptions.java b/java/src/main/java/ai/rapids/cudf/WriterOptions.java index 60f7fb03459..5d5af3006a3 100644 --- a/java/src/main/java/ai/rapids/cudf/WriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/WriterOptions.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ protected static class WriterBuilder { final List columnNullability = new ArrayList<>(); /** - * Add column name + * Add column name(s). For Parquet column names are not optional. * @param columnNames */ public T withColumnNames(String... 
columnNames) { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 81b9882104f..249a3d9b55b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -915,12 +915,24 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + cudf::jni::native_jintArray precisions(env, j_precisions); + + auto cpp_names = col_names.as_cpp_vector(); + table_input_metadata metadata; + metadata.column_metadata.resize(col_nullability.size()); + for (int i = 0; i < col_nullability.size(); i++) { + metadata.column_metadata[i] + .set_name(cpp_names[i]) + .set_nullability(col_nullability[i]) + .set_int96_timestamps(j_isInt96); + } + + // Precisions is not always set + for (int i = 0; i < precisions.size(); i++) { + metadata.column_metadata[i] + .set_decimal_precision(precisions[i]); + } - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); for (auto i = 0; i < meta_keys.size(); ++i) { metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); } @@ -928,16 +940,13 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); sink_info sink{data_sink.get()}; - cudf::jni::native_jintArray precisions(env, j_precisions); std::vector const v_precisions( precisions.data(), precisions.data() + precisions.size()); chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) - .nullable_metadata(&metadata) + .metadata(&metadata) .compression(static_cast(j_compression)) .stats_level(static_cast(j_stats_freq)) - .int96_timestamps(static_cast(j_isInt96)) - .decimal_precision(v_precisions) .build(); auto writer_ptr = std::make_unique(opts); @@ -965,27 +974,34 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jstring output_path(env, j_output_path); + cudf::jni::native_jintArray precisions(env, j_precisions); - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); - for (int i = 0; i < meta_keys.size(); ++i) { + auto cpp_names = col_names.as_cpp_vector(); + table_input_metadata metadata; + metadata.column_metadata.resize(col_nullability.size()); + for (int i = 0; i < col_nullability.size(); i++) { + metadata.column_metadata[i] + .set_name(cpp_names[i]) + .set_nullability(col_nullability[i]) + .set_int96_timestamps(j_isInt96); + } + + // Precisions is not always set + for (int i = 0; i < precisions.size(); i++) { + metadata.column_metadata[i] + .set_decimal_precision(precisions[i]); + } + + for (auto i = 0; i < meta_keys.size(); ++i) { metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); } - cudf::jni::native_jintArray precisions(env, j_precisions); - std::vector v_precisions( - precisions.data(), precisions.data() + precisions.size()); - + sink_info sink{output_path.get()}; 
chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink)
-        .nullable_metadata(&metadata)
+        .metadata(&metadata)
         .compression(static_cast(j_compression))
         .stats_level(static_cast(j_stats_freq))
-        .int96_timestamps(static_cast(j_isInt96))
-        .decimal_precision(v_precisions)
         .build();
   auto writer_ptr = std::make_unique(opts);
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c075f074068..02b64f0b713 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -4148,19 +4148,6 @@ private Table getExpectedFileTableWithDecimals() {
         .build();
   }

-  @Test
-  void testParquetWriteToFileNoNames() throws IOException {
-    File tempFile = File.createTempFile("test-nonames", ".parquet");
-    try (Table table0 = getExpectedFileTable()) {
-      table0.writeParquet(tempFile.getAbsoluteFile());
-      try (Table table1 = Table.readParquet(tempFile.getAbsoluteFile())) {
-        assertTablesAreEqual(table0, table1);
-      }
-    } finally {
-      tempFile.delete();
-    }
-  }
-
   private final class MyBufferConsumer implements HostBufferConsumer, AutoCloseable {
     public final HostMemoryBuffer buffer;
     long offset = 0;
@@ -4210,8 +4197,9 @@ void testParquetWriteToBufferChunkedInt96() {
     try (Table table0 = getExpectedFileTableWithDecimals();
         MyBufferConsumer consumer = new MyBufferConsumer()) {
       ParquetWriterOptions options = ParquetWriterOptions.builder()
+          .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")
           .withTimestampInt96(true)
-          .withPrecisionValues(5, 5)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 5, 5)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, consumer)) {
@@ -4228,9 +4216,13 @@ void testParquetWriteToBufferChunked() {
+    ParquetWriterOptions options = ParquetWriterOptions.builder()
+        .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7")
+        .withTimestampInt96(true)
+        .build();
     try (Table table0 = getExpectedFileTable();
         MyBufferConsumer consumer = new MyBufferConsumer()) {
-      try (TableWriter writer = Table.writeParquetChunked(ParquetWriterOptions.DEFAULT, consumer)) {
+      try (TableWriter writer = Table.writeParquetChunked(options, consumer)) {
         writer.write(table0);
         writer.write(table0);
         writer.write(table0);
@@ -4251,7 +4243,7 @@ void testParquetWriteToFileWithNames() throws IOException {
         "eighth", "nineth")
         .withCompressionType(CompressionType.NONE)
         .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
-        .withPrecisionValues(5, 6)
+        .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 5, 6)
         .build();
     try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
       writer.write(table0);
@@ -4274,7 +4266,7 @@ void testParquetWriteToFileWithNamesAndMetadata() throws IOException {
         .withMetadata("somekey", "somevalue")
         .withCompressionType(CompressionType.NONE)
         .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
-        .withPrecisionValues(6, 8)
+        .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 6, 8)
         .build();
     try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
       writer.write(table0);
@@ -4292,9 +4284,10 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException {
     File tempFile = File.createTempFile("test-uncompressed", ".parquet");
     try (Table table0 = getExpectedFileTableWithDecimals()) {
       ParquetWriterOptions options = ParquetWriterOptions.builder()
+          .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")
          .withCompressionType(CompressionType.NONE)
          .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
-          .withPrecisionValues(4, 6)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 4, 6)
          .build();
      try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
        writer.write(table0);

From 708673225b831036c22a29a435324e27ad202008 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 19 Mar 2021 17:55:57 -0400
Subject: [PATCH 20/26] Work-around for gcc7 compile error on Centos7 (#7652)

Closes #7650
I was able to reproduce this with the `nvidia/cuda:10.2-devel-centos7` docker image using gcc 7.5.0. After some googling and trial and error, I found that simply moving the offending line up one line, outside the lambda in the source file, gets past the error.

Authors:
  - David (@davidwendt)

Approvers:
  - Devavret Makkar (@devavret)
  - Nghia Truong (@ttnghia)
  - Mike Wilson (@hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/7652
---
 cpp/include/cudf/strings/detail/strings_column_factories.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
index 932f7eb0926..92cf537454c 100644
--- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh
+++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -93,10 +93,10 @@ std::unique_ptr make_strings_column(IndexPairIterator begin,
   auto null_mask =
     (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};
+  auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
   // build chars column
   std::unique_ptr chars_column = [&] {
     // use a character-parallel kernel for long string lengths
-    auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
     if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
       auto const d_offsets =
         device_span{offsets_column->view().template data(),

From 217d702c9cb546dc8c175defc8876fd473b5c8a0 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Sat, 20 Mar 2021 03:28:46 +0530
Subject: [PATCH 21/26] Fix ORC reader issue with reading empty string columns
 (#7656)

There was a [condition in the reader where, if the data size was zero](https://github.com/rapidsai/cudf/blob/8773a40f4c8ce63f56ed6eb67b4eaf959106939f/cpp/src/io/orc/reader_impl.cu#L538), the stream pointer was not updated. However, `["", ""]` is valid data with size 0, and the reader was returning it as `[null, null]`; the condition that caused this issue has been removed. Test cases have been added to validate the fix.
closes #7620

Authors:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

Approvers:
  - Devavret Makkar (@devavret)
  - Vukasin Milovanovic (@vuule)
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7656
---
 cpp/src/io/orc/reader_impl.cu      |  4 +---
 python/cudf/cudf/tests/test_orc.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 61adef26dab..2567b2579d7 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -535,9 +535,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
         chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
       }
       for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-        if (chunk.strm_len[k] > 0) {
-          chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
-        }
+        chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
       }
     }
     stripe_start_row += stripe_info->numberOfRows;
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ca8aa00f80c..fa14a0a9690 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -738,3 +738,19 @@ def test_nanoseconds_overflow():
     pyarrow_got = pa.orc.ORCFile(buffer).read()

     assert_eq(expected.to_pandas(), pyarrow_got.to_pandas())
+
+
+@pytest.mark.parametrize(
+    "data", [[None, ""], ["", None], [None, None], ["", ""]]
+)
+def test_empty_string_columns(data):
+    buffer = BytesIO()
+
+    expected = cudf.DataFrame({"string": data}, dtype="str")
+    expected.to_orc(buffer)
+
+    expected_pdf = pd.read_orc(buffer)
+    got_df = cudf.read_orc(buffer)
+
+    assert_eq(expected, got_df)
+    assert_eq(expected_pdf, got_df)

From cdd44d21629d014fb431aec3ebb4c02ad56cb29e Mon Sep 17 00:00:00 2001
From: Liangcai Li
Date: Sat, 20 Mar 2021 09:18:39 +0800
Subject: [PATCH 22/26] JNI: Pass names of children struct columns to native
 Arrow IPC writer (#7598)

This PR adds support for building the structure of the column metadata from the flattened column names according to the table schema, since the children's column metadata is required when converting cudf tables to arrow tables. The related unit tests are updated as well.

closes https://github.com/rapidsai/cudf/issues/7570

Signed-off-by: Firestarman

Authors:
  - Liangcai Li (@firestarman)

Approvers:
  - Jason Lowe (@jlowe)

URL: https://github.com/rapidsai/cudf/pull/7598
---
 .../ai/rapids/cudf/ArrowIPCWriterOptions.java | 58 +++++++++++++++++
 java/src/main/native/src/TableJni.cpp         | 63 +++++++++++++++++--
 .../test/java/ai/rapids/cudf/TableTest.java   | 46 +++++++++++---
 3 files changed, 151 insertions(+), 16 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java
index 298e99b059d..ee5ae094b29 100644
--- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java
@@ -67,6 +67,64 @@ public Builder withCallback(DoneOnGpu callback) {
       return this;
     }

+    /**
+     * Add the name(s) for nullable column(s).
+     *
+     * Please note that the column names of the nested struct columns should be flattened in sequence.
+     * For example,
+     * 
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
+     *   output:
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
+     *
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
+     *   output:
+     *                   ["int_col", "list_col"]
+     *
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
+     *   output:
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
+     * 
+     *
+     * @param columnNames The column names corresponding to the written table(s).
+     */
+    @Override
+    public Builder withColumnNames(String... columnNames) {
+      return super.withColumnNames(columnNames);
+    }
+
+    /**
+     * Add the name(s) for non-nullable column(s).
+     *
+     * Please note that the column names of the nested struct columns should be flattened in sequence.
+     * For example,
+     * 
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
+     *   output:
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
+     *
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
+     *   output:
+     *                   ["int_col", "list_col"]
+     *
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
+     *   output:
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
+     * 
+ * + * @param columnNames The column names corresponding to the written table(s). + */ + @Override + public Builder withNotNullableColumnNames(String... columnNames) { + return super.withNotNullableColumnNames(columnNames); + } + public ArrowIPCWriterOptions build() { return new ArrowIPCWriterOptions(this); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 249a3d9b55b..6beedf54f5a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -212,12 +212,15 @@ class native_arrow_ipc_writer_handle final { const std::shared_ptr &sink) : initialized(false), column_names(col_names), file_name(""), sink(sink) {} +private: bool initialized; std::vector column_names; + std::vector columns_meta; std::string file_name; std::shared_ptr sink; std::shared_ptr writer; +public: void write(std::shared_ptr &arrow_tab, int64_t max_chunk) { if (!initialized) { if (!sink) { @@ -246,6 +249,59 @@ class native_arrow_ipc_writer_handle final { } initialized = false; } + + std::vector get_column_metadata(const cudf::table_view& tview) { + if (!column_names.empty() && columns_meta.empty()) { + // Rebuild the structure of column meta according to table schema. + // All the tables written by this writer should share the same schema, + // so build column metadata only once. + columns_meta.reserve(tview.num_columns()); + size_t idx = 0; + for (auto itr = tview.begin(); itr < tview.end(); ++itr) { + // It should consume the column names only when a column is + // - type of struct, or + // - not a child. + columns_meta.push_back(build_one_column_meta(*itr, idx)); + } + if (idx < column_names.size()) { + throw cudf::jni::jni_exception("Too many column names are provided."); + } + } + return columns_meta; + } + +private: + cudf::column_metadata build_one_column_meta(const cudf::column_view& cview, size_t& idx, + const bool consume_name = true) { + auto col_meta = cudf::column_metadata{}; + if (consume_name) { + col_meta.name = get_column_name(idx++); + } + // Process children + if (cview.type().id() == cudf::type_id::LIST) { + // list type: + // - requires a stub metadata for offset column(index: 0). + // - does not require a name for the child column(index 1). + col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), idx, false)}; + } else if (cview.type().id() == cudf::type_id::STRUCT) { + // struct type always consumes the column names. + col_meta.children_meta.reserve(cview.num_children()); + for (auto itr = cview.child_begin(); itr < cview.child_end(); ++itr) { + col_meta.children_meta.push_back(build_one_column_meta(*itr, idx)); + } + } else if (cview.type().id() == cudf::type_id::DICTIONARY32) { + // not supported yet in JNI, nested type? 
+ throw cudf::jni::jni_exception("Unsupported type 'DICTIONARY32'"); + } + return col_meta; + } + + std::string& get_column_name(const size_t idx) { + if (idx < 0 || idx >= column_names.size()) { + throw cudf::jni::jni_exception("Missing names for columns or nested struct columns"); + } + return column_names[idx]; + } }; class jni_arrow_output_stream final : public arrow::io::OutputStream { @@ -1262,12 +1318,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv cudf::jni::auto_set_device(env); std::unique_ptr> result( new std::shared_ptr(nullptr)); - auto column_metadata = std::vector{}; - column_metadata.reserve(state->column_names.size()); - std::transform(std::begin(state->column_names), std::end(state->column_names), - std::back_inserter(column_metadata), - [](auto const &column_name) { return cudf::column_metadata{column_name}; }); - *result = cudf::to_arrow(*tview, column_metadata); + *result = cudf::to_arrow(*tview, state->get_column_metadata(*tview)); if (!result->get()) { return 0; } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 02b64f0b713..4eee3e97e6e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4123,15 +4123,38 @@ void testTableBasedFilter() { } private Table getExpectedFileTable() { - return new TestBuilder() - .column(true, false, false, true, false) - .column(5, 1, 0, 2, 7) - .column(new Byte[]{2, 3, 4, 5, 9}) - .column(3l, 9l, 4l, 2l, 20l) - .column("this", "is", "a", "test", "string") - .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f) - .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d) - .build(); + return getExpectedFileTable(false); + } + + private Table getExpectedFileTable(boolean withNestedColumns) { + TestBuilder tb = new TestBuilder() + .column(true, false, false, true, false) + .column(5, 1, 0, 2, 7) + .column(new Byte[]{2, 3, 4, 5, 9}) + .column(3l, 9l, 4l, 2l, 20l) + .column("this", "is", "a", "test", "string") + .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f) + .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d); + if (withNestedColumns) { + StructType nestedType = new StructType(true, + new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); + tb.column(nestedType, + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), new HostColumnVector.StructData((List) null)) + .column(new ListType(false, new BasicType(false, DType.INT32)), + Arrays.asList(1, 2), + Arrays.asList(3, 4), + Arrays.asList(5), + Arrays.asList(6, 7), + Arrays.asList(8, 9, 10)) + .column(new ListType(false, nestedType), + Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")), + Arrays.asList(struct(4, "k4"), struct(5, "k5")), + Arrays.asList(struct(6, "k6")), + Arrays.asList(new HostColumnVector.StructData((List) null)), + Arrays.asList()); + } + return tb.build(); } private Table getExpectedFileTableWithDecimals() { @@ -4332,10 +4355,13 @@ void testArrowIPCWriteToFileWithNamesAndMetadata() throws IOException { @Test void testArrowIPCWriteToBufferChunked() { - try (Table table0 = getExpectedFileTable(); + try (Table table0 = getExpectedFileTable(true); MyBufferConsumer consumer = new MyBufferConsumer()) { ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") + .withColumnNames("eighth", "eighth_id", "eighth_name") + .withColumnNames("ninth") + .withColumnNames("tenth", "child_id", "child_name") .build(); try (TableWriter writer 
= Table.writeArrowIPCChunked(options, consumer)) { writer.write(table0); From bb05ddc4d7b9852bbe87694977a259d4466d6604 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 19 Mar 2021 19:17:39 -0700 Subject: [PATCH 23/26] Use rmm::device_uvector in place of rmm::device_vector for ORC reader/writer and cudf::io::column_buffer (#7614) Issue #7287 Replaces `device_vector` with `device_uvector` and `device_span`. Because `device_uvector` does not have a default constructor, some additional changes were required for `device_uvector` data members. Performance impact: this change makes a measurable difference in reader benchmarks. Most string column cases are sped up around **5%**, with other cases having a measurable, but less consistent improvement. Authors: - Vukasin Milovanovic (@vuule) Approvers: - Ram (Ramakrishna Prabhu) (@rgsl888prabhu) - Kumar Aatish (@kaatish) URL: https://github.com/rapidsai/cudf/pull/7614 --- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl.cu | 58 ++++++++++++-------------- cpp/src/io/orc/reader_impl.hpp | 6 +-- cpp/src/io/orc/timezone.cpp | 20 ++++++++- cpp/src/io/orc/timezone.cuh | 17 ++++++-- cpp/src/io/utilities/column_buffer.hpp | 15 ++++--- 6 files changed, 70 insertions(+), 48 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 332e8aff7fc..76580122fe6 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -351,7 +351,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) // during the conversion stage const std::string quotechar(1, opts.quotechar); const std::string dblquotechar(2, opts.quotechar); - std::unique_ptr col = cudf::make_strings_column(out_buffers[i]._strings, stream); + std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); out_columns.emplace_back( cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); } else { diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 2567b2579d7..e6e3ec69e43 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include @@ -223,7 +223,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( const OrcDecompressor *decompressor, std::vector &stream_info, size_t num_stripes, - rmm::device_vector &row_groups, + device_span row_groups, size_t row_index_stride, rmm::cuda_stream_view stream) { @@ -254,9 +254,9 @@ rmm::device_buffer reader::impl::decompress_stripe_data( CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found"); rmm::device_buffer decomp_data(total_decomp_size, stream); - rmm::device_vector inflate_in(num_compressed_blocks + - num_uncompressed_blocks); - rmm::device_vector inflate_out(num_compressed_blocks); + rmm::device_uvector inflate_in( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector inflate_out(num_compressed_blocks, stream); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; @@ -265,9 +265,9 @@ rmm::device_buffer reader::impl::decompress_stripe_data( for (size_t i = 0; i < compinfo.size(); ++i) { auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].decctl = inflate_in.data().get() + start_pos; - compinfo[i].decstatus = inflate_out.data().get() + start_pos; - compinfo[i].copyctl = inflate_in.data().get() + start_pos_uncomp; + compinfo[i].decctl = inflate_in.data() + 
start_pos; + compinfo[i].decstatus = inflate_out.data() + start_pos; + compinfo[i].copyctl = inflate_in.data() + start_pos_uncomp; stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; @@ -285,19 +285,18 @@ rmm::device_buffer reader::impl::decompress_stripe_data( if (num_compressed_blocks > 0) { switch (decompressor->GetKind()) { case orc::ZLIB: - CUDA_TRY(gpuinflate( - inflate_in.data().get(), inflate_out.data().get(), num_compressed_blocks, 0, stream)); + CUDA_TRY( + gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); break; case orc::SNAPPY: - CUDA_TRY(gpu_unsnap( - inflate_in.data().get(), inflate_out.data().get(), num_compressed_blocks, stream)); + CUDA_TRY(gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream)); break; default: CUDF_EXPECTS(false, "Unexpected decompression dispatch"); break; } } if (num_uncompressed_blocks > 0) { CUDA_TRY(gpu_copy_uncompressed_blocks( - inflate_in.data().get() + num_compressed_blocks, num_uncompressed_blocks, stream)); + inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream)); } gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); @@ -324,7 +323,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( if (not row_groups.empty()) { chunks.host_to_device(stream); - gpu::ParseRowGroupIndex(row_groups.data().get(), + gpu::ParseRowGroupIndex(row_groups.data(), compinfo.device_ptr(), chunks.device_ptr(), num_columns, @@ -341,8 +340,8 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks size_t num_dicts, size_t skip_rows, size_t num_rows, - timezone_table const &tz_table, - const rmm::device_vector &row_groups, + timezone_table_view tz_table, + device_span row_groups, size_t row_index_stride, std::vector &out_buffers, rmm::cuda_stream_view stream) @@ -360,24 +359,19 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks } // Allocate global dictionary for deserializing - rmm::device_vector global_dict(num_dicts); + rmm::device_uvector global_dict(num_dicts, stream); chunks.host_to_device(stream); - gpu::DecodeNullsAndStringDictionaries(chunks.device_ptr(), - global_dict.data().get(), - num_columns, - num_stripes, - num_rows, - skip_rows, - stream); + gpu::DecodeNullsAndStringDictionaries( + chunks.device_ptr(), global_dict.data(), num_columns, num_stripes, num_rows, skip_rows, stream); gpu::DecodeOrcColumnData(chunks.device_ptr(), - global_dict.data().get(), + global_dict.data(), num_columns, num_stripes, num_rows, skip_rows, - tz_table.view(), - row_groups.data().get(), + tz_table, + row_groups.data(), row_groups.size() / num_columns, row_index_stride, stream); @@ -548,7 +542,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Process dataset chunk pages into output columns if (stripe_data.size() != 0) { // Setup row group descriptors if using indexes - rmm::device_vector row_groups(num_rowgroups * num_columns); + rmm::device_uvector row_groups(num_rowgroups * num_columns, stream); if (_metadata->ps.compression != orc::NONE) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, @@ -561,9 +555,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, stripe_data.clear(); stripe_data.push_back(std::move(decomp_data)); } else { - if (not row_groups.empty()) { + if (not row_groups.is_empty()) { chunks.host_to_device(stream); - gpu::ParseRowGroupIndex(row_groups.data().get(), + gpu::ParseRowGroupIndex(row_groups.data(), nullptr, chunks.device_ptr(), 
num_columns, @@ -577,7 +571,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Setup table for converting timestamp columns from local to UTC time auto const tz_table = _has_timestamp_column - ? build_timezone_transition_table(selected_stripes[0].second->writerTimezone) + ? build_timezone_transition_table(selected_stripes[0].second->writerTimezone, stream) : timezone_table{}; std::vector out_buffers; @@ -596,7 +590,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, num_dict_entries, skip_rows, num_rows, - tz_table, + tz_table.view(), row_groups, _metadata->get_row_index_stride(), out_buffers, diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 818e70b15e7..3a2913c5548 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -97,7 +97,7 @@ class reader::impl { const OrcDecompressor *decompressor, std::vector &stream_info, size_t num_stripes, - rmm::device_vector &row_groups, + device_span row_groups, size_t row_index_stride, rmm::cuda_stream_view stream); @@ -118,8 +118,8 @@ class reader::impl { size_t num_dicts, size_t skip_rows, size_t num_rows, - timezone_table const &tz_table, - const rmm::device_vector &row_groups, + timezone_table_view tz_table, + device_span row_groups, size_t row_index_stride, std::vector &out_buffers, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index bf8b96b89dc..81ffa954c1a 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -374,7 +374,8 @@ static int64_t get_transition_time(dst_transition_s const &trans, int year) return trans.time + day * day_seconds; } -timezone_table build_timezone_transition_table(std::string const &timezone_name) +timezone_table build_timezone_transition_table(std::string const &timezone_name, + rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC @@ -459,7 +460,22 @@ timezone_table build_timezone_transition_table(std::string const &timezone_name) year_timestamp += (365 + is_leap_year(year)) * day_seconds; } - return {get_gmt_offset(ttimes, offsets, orc_utc_offset), ttimes, offsets}; + rmm::device_uvector d_ttimes{ttimes.size(), stream}; + CUDA_TRY(cudaMemcpyAsync(d_ttimes.data(), + ttimes.data(), + ttimes.size() * sizeof(int64_t), + cudaMemcpyDefault, + stream.value())); + rmm::device_uvector d_offsets{offsets.size(), stream}; + CUDA_TRY(cudaMemcpyAsync(d_offsets.data(), + offsets.data(), + offsets.size() * sizeof(int32_t), + cudaMemcpyDefault, + stream.value())); + auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); + stream.synchronize(); + + return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)}; } } // namespace io diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 3a87f28391c..b0231ca9e7d 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -20,8 +20,8 @@ #include #include -#include #include +#include #include #include @@ -108,8 +108,15 @@ inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes struct timezone_table { int32_t gmt_offset = 0; - rmm::device_vector ttimes; - rmm::device_vector offsets; + rmm::device_uvector ttimes; + rmm::device_uvector offsets; + timezone_table() : ttimes{0, rmm::cuda_stream_default}, offsets{0, rmm::cuda_stream_default} {} + timezone_table(int32_t gmt_offset, + rmm::device_uvector &&ttimes, + rmm::device_uvector &&offsets) + : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, 
offsets{std::move(offsets)} + { + } timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } }; @@ -119,10 +126,12 @@ struct timezone_table { * Uses system's TZif files. Assumes little-endian platform when parsing these files. * * @param timezone_name standard timezone name (for example, "US/Pacific") + * @param stream CUDA stream used for any device memory operations and kernel launches * * @return The transition table for the given timezone */ -timezone_table build_timezone_transition_table(std::string const &timezone_name); +timezone_table build_timezone_transition_table(std::string const &timezone_name, + rmm::cuda_stream_view stream); } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index b4e26c042fb..88444d41206 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -32,7 +32,7 @@ #include #include -#include +#include namespace cudf { namespace io { @@ -108,7 +108,10 @@ struct column_buffer { size = _size; switch (type.id()) { - case type_id::STRING: _strings.resize(size); break; + case type_id::STRING: + _strings = std::make_unique>(size, stream); + cudaMemsetAsync(_strings->data(), 0, size * sizeof(str_pair), stream.value()); + break; // list columns store a buffer of int32's as offsets to represent // their individual rows @@ -125,8 +128,8 @@ struct column_buffer { } } - auto data() { return _strings.size() ? _strings.data().get() : _data.data(); } - auto data_size() { return std::max(_data.size(), _strings.size() * sizeof(str_pair)); } + auto data() { return _strings ? _strings->data() : _data.data(); } + auto data_size() const { return _strings ? _strings->size() : _data.size(); } template auto null_mask() @@ -137,7 +140,7 @@ struct column_buffer { auto& null_count() { return _null_count; } - rmm::device_vector _strings; + std::unique_ptr> _strings; rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; @@ -178,7 +181,7 @@ std::unique_ptr make_column( schema_info->children.push_back(column_name_info{"offsets"}); schema_info->children.push_back(column_name_info{"chars"}); } - return make_strings_column(buffer._strings, stream, mr); + return make_strings_column(*buffer._strings, stream, mr); case type_id::LIST: { // make offsets column From 69a848db97fd0b1ecb08304451157381ffff8e05 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Sat, 20 Mar 2021 00:04:52 -0500 Subject: [PATCH 24/26] Fix `find_package(cudf)` (#7658) This PR makes `find_package(cudf)` work when `libcudf` is installed via conda. This should make CPM use the conda-installed `libcudf` in https://github.com/rapidsai/cudf/pull/7484 and https://github.com/rapidsai/cuspatial/pull/365. Tested in a `gpuci/miniconda-cuda:10.2-devel-ubuntu18.04` container. Install dependencies: ``` mamba install -n base \ -c conda-forge gtest gmock cmake=3.18 boost-cpp=1.72.0 \ -c rapidsai-nightly libcudf ``` Use this `CMakeLists.txt`: ```cmake cmake_minimum_required(VERSION 3.18) project(test VERSION 1.0.0 LANGUAGES C CXX CUDA) find_package(cudf) add_library(test test.cpp) target_link_libraries(test cudf::cudf) ```
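The PR description doesn't include `test.cpp` itself; any translation unit that includes a cudf header and references a cudf symbol is enough to exercise the imported `cudf::cudf` target. A placeholder like the following works (hypothetical contents, not part of the PR):

```cpp
// Placeholder test.cpp: it only needs to pull in a cudf header and touch a
// cudf symbol so the compile and link steps are exercised.
#include <cudf/column/column.hpp>

cudf::size_type column_length(cudf::column const& col) { return col.size(); }
```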
See this output:
```
(base) root@8c350dfb37f1:/tmp/findpackagecudf# mkdir build && cmake -B build -S .
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- The CUDA compiler identification is NVIDIA 10.2.89
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Found CUDAToolkit: /usr/local/cuda/include (found version "10.2.89")
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE
-- Found ZLIB: /opt/conda/lib/libz.so (found version "1.2.11")
-- Found Boost: /opt/conda/include (found suitable version "1.72.0", minimum required is "1.65.0") found components: filesystem
-- Found Thrust: /usr/local/cuda/include (found suitable version "1.9.7", minimum required is "1.9.0")
-- Found rmm: /opt/conda/lib/cmake/rmm/rmm-config.cmake (found suitable version "0.19.0", minimum required is "0.19")
-- Found GTest: /opt/conda/lib/libgtest.so (Required is at least version "1.10.0")
-- Found Thrust: /opt/conda/include/libcudf/Thrust/thrust/cmake/thrust-config.cmake (found suitable version "1.10.0.0", minimum required is "1.10.0")
-- Found cudf: /opt/conda/lib/cmake/cudf/cudf-config.cmake (found version "0.19.0")
-- Configuring done
-- Generating done
-- Build files have been written to: /tmp/findpackagecudf/build
```
The generated compile command for test.cpp:
```
/usr/bin/c++
 -DBOOST_NO_CXX14_CONSTEXPR
 -DCUDF_VERSION=0.19.0
 -DJITIFY_USE_CACHE
 -DSPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO
 -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA
 -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP
 -I/opt/conda/include/libcudf/Thrust/dependencies/cub
 -isystem /opt/conda/include
 -isystem /opt/conda/include/libcudf/libcudacxx
 -isystem /opt/conda/include/libcudf/Thrust
 -isystem /usr/local/cuda/include
 -fPIC
 -o CMakeFiles/test.dir/test.cpp.o
 -c /tmp/findpackagecudf/test.cpp
```
Authors:
  - Paul Taylor (@trxcllnt)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7658
---
 cpp/CMakeLists.txt                        |  6 +++++-
 cpp/cmake/cudf-config.cmake.in            | 14 ++++++++++++--
 cpp/cmake/thirdparty/CUDF_GetGTest.cmake  |  4 ++--
 cpp/cmake/thirdparty/CUDF_GetThrust.cmake |  1 +
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f15fd649b83..3e875b71ca6 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -425,7 +425,8 @@ target_include_directories(cudf
            "$"
     PRIVATE "$"
     INTERFACE "$"
-              "$")
+              "$"
+              "$")

 # Add Conda library paths if specified
 if(CONDA_LINK_DIRS)
@@ -578,6 +579,9 @@ install(DIRECTORY ${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf)

+install(DIRECTORY ${Thrust_SOURCE_DIR}/
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust)
+
 include(CMakePackageConfigHelpers)
 configure_package_config_file(cmake/cudf-config.cmake.in "${CUDF_BINARY_DIR}/cmake/cudf-config.cmake"
diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in
index 1147e1160e7..0a478516f18 100644
--- a/cpp/cmake/cudf-config.cmake.in
+++ b/cpp/cmake/cudf-config.cmake.in
@@ -15,12 +15,22 @@ find_dependency(CUDAToolkit)
 find_dependency(Threads)
 find_dependency(ZLIB)

+# Don't look for a Boost CMake configuration file because it adds the
+# `-DBOOST_ALL_NO_LIB` and `-DBOOST_FILESYSTEM_DYN_LINK` compile defs
+set(Boost_NO_BOOST_CMAKE ON)
+find_dependency(Boost @CUDF_MIN_VERSION_Boost@ COMPONENTS filesystem)
+
 find_dependency(Arrow @CUDF_VERSION_Arrow@)
+
+set(ArrowCUDA_DIR "${Arrow_DIR}")
 find_dependency(ArrowCUDA @CUDF_VERSION_Arrow@)

-find_dependency(Boost @CUDF_MIN_VERSION_Boost@)
 find_dependency(rmm @CUDF_MIN_VERSION_rmm@)
-find_dependency(gtest @CUDF_MIN_VERSION_gtest@)
+find_dependency(GTest @CUDF_MIN_VERSION_GTest@)
+
+set(Thrust_ROOT "${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust")
+find_dependency(Thrust @CUDF_MIN_VERSION_Thrust@)
+thrust_create_target(cudf::Thrust FROM_OPTIONS)

 list(POP_FRONT CMAKE_MODULE_PATH)
diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
index 2911e4fce29..e346dce1730 100644
--- a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
@@ -48,6 +48,6 @@
     endif()
 endfunction()

-set(CUDF_MIN_VERSION_gtest 1.10.0)
+set(CUDF_MIN_VERSION_GTest 1.10.0)

-find_and_configure_gtest(${CUDF_MIN_VERSION_gtest})
+find_and_configure_gtest(${CUDF_MIN_VERSION_GTest})
diff --git a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake
index e045093104a..5a304f234d2 100644
--- a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake
@@ -24,6 +24,7 @@ function(find_and_configure_thrust VERSION)
     thrust_create_target(cudf::Thrust FROM_OPTIONS)

     set(THRUST_LIBRARY "cudf::Thrust" PARENT_SCOPE)
+    set(Thrust_SOURCE_DIR "${Thrust_SOURCE_DIR}" PARENT_SCOPE)
 endfunction()

 set(CUDF_MIN_VERSION_Thrust 1.10.0)

From 11455eef5a4e1215c5fcfe9ee175e004b6f11fcc Mon Sep 17 00:00:00 2001
From: MithunR
Date: Sun, 21 Mar 2021 10:17:36 -0700
Subject: [PATCH 25/26] Fix Debug build break with device_uvectors in grouped_rolling.cu (#7633)

After #7523, on `Debug` builds, one sees the following build error:
```
cudf/cpp/src/rolling/grouped_rolling.cu(130): error: no operator "[]" matches these operands
            operand types are: const cudf::groupby::detail::sort::sort_groupby_helper::index_vector [ int ]
...
cudf/cpp/src/rolling/grouped_rolling.cu(130): error: no operator "[]" matches these operands
            operand types are: const cudf::groupby::detail::sort::sort_groupby_helper::index_vector [ unsigned long ]
```
The offending line has an `assert()` that is only compiled during `Debug` builds. This commit
corrects the indexing into `device_uvector`.
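For reference, `rmm::device_uvector` intentionally has no host-side `operator[]`; single values
must be read back with `element()`, which takes the stream to copy on. A minimal sketch of the
pattern (helper name and element type are hypothetical, not from this patch):
```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

// Hypothetical debug check mirroring the fixed assert: element(i, stream)
// copies the i-th value device-to-host on `stream` and returns it by value;
// group_offsets[i] would not compile for a device_uvector on the host.
bool offsets_are_valid(rmm::device_uvector<std::size_t> const& group_offsets,
                       std::size_t num_rows,
                       rmm::cuda_stream_view stream)
{
  return group_offsets.size() >= 2 &&
         group_offsets.element(0, stream) == 0 &&
         group_offsets.element(group_offsets.size() - 1, stream) == num_rows;
}
```
Making the stream explicit is the point of the `device_uvector` migration: it avoids the hidden
default-stream synchronization that `thrust::device_vector`'s `operator[]` performs.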
Authors:
  - MithunR (@mythrocks)

Approvers:
  - David (@davidwendt)
  - Nghia Truong (@ttnghia)
  - Devavret Makkar (@devavret)
  - Karthikeyan (@karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/7633
---
 cpp/src/rolling/grouped_rolling.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index c1ebc9f3f9f..b8cb5e45fec 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -110,8 +110,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys,
   using sort_groupby_helper = cudf::groupby::detail::sort::sort_groupby_helper;

   sort_groupby_helper helper{group_keys, cudf::null_policy::INCLUDE, cudf::sorted::YES};
-  auto const& group_offsets{helper.group_offsets()};
-  auto const& group_labels{helper.group_labels()};
+  auto const& group_offsets{helper.group_offsets(stream)};
+  auto const& group_labels{helper.group_labels(stream)};

   // `group_offsets` are interpreted in adjacent pairs, each pair representing the offsets
   // of the first, and one past the last elements in a group.
@@ -127,8 +127,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys,
   //    groups.)
   // 3. [0, 500, 1000] indicates two equal-sized groups: [0,500), and [500,1000).

-  assert(group_offsets.size() >= 2 && group_offsets[0] == 0 &&
-         group_offsets[group_offsets.size() - 1] == input.size() &&
+  assert(group_offsets.size() >= 2 && group_offsets.element(0, stream) == 0 &&
+         group_offsets.element(group_offsets.size() - 1, stream) == input.size() &&
          "Must have at least one group.");

   auto preceding_calculator = [d_group_offsets = group_offsets.data(),

From 5d7767ec2fdf6eec4a76c601e66a8eaf21bea666 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Mon, 22 Mar 2021 11:09:39 +0530
Subject: [PATCH 26/26] Fix bool column corruption with ORC Reader (#7483)

When values are buffered, only a partial set of bits may be read, so `max_vals` has to be
rounded up to the next byte boundary.

closes #7345

Authors:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

Approvers:
  - Vukasin Milovanovic (@vuule)
  - Devavret Makkar (@devavret)

URL: https://github.com/rapidsai/cudf/pull/7483
---
 cpp/src/io/orc/stripe_data.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 1ff752034ad..6206d98773f 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1572,7 +1572,7 @@ __global__ void __launch_bounds__(block_size)
         if (t == 0) { s->top.data.buffered_count = n; }
       }

-      numvals = min(numvals * 8, is_last_set ? s->top.data.max_vals : blockDim.x);
+      numvals = min(numvals * 8, is_last_set ? (s->top.data.max_vals + 7) & (~0x7) : blockDim.x);
     } else if (s->chunk.type_kind == LONG || s->chunk.type_kind == TIMESTAMP ||
                s->chunk.type_kind == DECIMAL) {
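To make the rounding expression concrete: adding 7 and then clearing the low three bits rounds a
bit count up to a whole number of bytes, so counts of 1 through 8 map to 8, 9 through 16 map to
16, and exact multiples of 8 are unchanged. A standalone sketch (not part of the patch):
```cpp
#include <cstdio>
#include <initializer_list>

// Round a bit count up to the next multiple of 8 (a whole byte), the same
// trick the fixed line applies to max_vals before clamping numvals.
constexpr unsigned round_up_to_byte(unsigned nbits) { return (nbits + 7) & ~0x7u; }

int main()
{
  for (unsigned nbits : {1u, 7u, 8u, 9u, 1000u}) {
    std::printf("%4u bits -> %4u\n", nbits, round_up_to_byte(nbits));
  }
  return 0;
}
```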