diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4edd6965c4b..8bd4c8d1a63 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -160,7 +160,7 @@ git submodule update --init --remote --recursive ```bash # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda10.0.yml +conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.0.yml # activate the environment conda activate cudf_dev ``` @@ -281,8 +281,8 @@ A Dockerfile is provided with a preconfigured conda environment for building and ### Prerequisites * Install [nvidia-docker2](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)) for Docker + GPU support -* Verify NVIDIA driver is `410.48` or higher -* Ensure CUDA 10.0+ is installed +* Verify NVIDIA driver is `450.80.02` or higher +* Ensure CUDA 11.0+ is installed ### Usage @@ -309,9 +309,9 @@ flag. Below is a list of the available arguments and their purpose: | Build Argument | Default Value | Other Value(s) | Purpose | | --- | --- | --- | --- | -| `CUDA_VERSION` | 10.0 | 10.1, 10.2 | set CUDA version | -| `LINUX_VERSION` | ubuntu16.04 | ubuntu18.04 | set Ubuntu version | -| `CC` & `CXX` | 5 | 7 | set gcc/g++ version; **NOTE:** gcc7 requires Ubuntu 18.04 | +| `CUDA_VERSION` | 11.0 | 11.1, 11.2.2 | set CUDA version | +| `LINUX_VERSION` | ubuntu18.04 | ubuntu20.04 | set Ubuntu version | +| `CC` & `CXX` | 9 | 10 | set gcc/g++ version | | `CUDF_REPO` | This repo | Forks of cuDF | set git URL to use for `git clone` | | `CUDF_BRANCH` | main | Any branch name | set git branch to checkout of `CUDF_REPO` | | `NUMBA_VERSION` | newest | >=0.40.0 | set numba version | diff --git a/Dockerfile b/Dockerfile index f48ed3646f4..d24c5d05556 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,20 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + # An integration test & dev container which builds and installs cuDF from main -ARG CUDA_VERSION=10.1 +ARG CUDA_VERSION=11.0 ARG CUDA_SHORT_VERSION=${CUDA_VERSION} -ARG LINUX_VERSION=ubuntu16.04 +ARG LINUX_VERSION=ubuntu18.04 FROM nvidia/cuda:${CUDA_VERSION}-devel-${LINUX_VERSION} ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib -# Needed for cudf.concat(), avoids "OSError: library nvvm not found" -ENV NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so -ENV NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/ ENV DEBIAN_FRONTEND=noninteractive -ARG CC=5 -ARG CXX=5 +ARG CC=9 +ARG CXX=9 RUN apt update -y --fix-missing && \ apt upgrade -y && \ + apt install -y --no-install-recommends software-properties-common && \ + add-apt-repository ppa:ubuntu-toolchain-r/test && \ + apt update -y --fix-missing && \ apt install -y --no-install-recommends \ git \ gcc-${CC} \ @@ -66,18 +68,10 @@ RUN if [ -f /cudf/docker/package_versions.sh ]; \ conda env create --name cudf --file /cudf/conda/environments/cudf_dev_cuda${CUDA_SHORT_VERSION}.yml ; \ fi -# libcudf build/install ENV CC=/usr/bin/gcc-${CC} ENV CXX=/usr/bin/g++-${CXX} -RUN source activate cudf && \ - mkdir -p /cudf/cpp/build && \ - cd /cudf/cpp/build && \ - cmake .. 
-DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} && \ - make -j"$(nproc)" install -# cuDF build/install +# libcudf & cudf build/install RUN source activate cudf && \ - cd /cudf/python/cudf && \ - python setup.py build_ext --inplace && \ - python setup.py install && \ - python setup.py install + cd /cudf/ && \ + ./build.sh libcudf cudf diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 76059867321..ed2484814fb 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -14,14 +14,14 @@ else fi # upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "10.1" ]]; then +if [[ "$CUDA" == "11.0" ]]; then export UPLOAD_CUDF_KAFKA=1 else export UPLOAD_CUDF_KAFKA=0 fi #We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "10.1" ]]; then +if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "11.0" ]]; then export UPLOAD_LIBCUDF_KAFKA=1 else export UPLOAD_LIBCUDF_KAFKA=0 diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml deleted file mode 100644 index 3c26dedda20..00000000000 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge - - defaults -dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 - - cupy>7.1.0,<9.0.0a0 - - rmm=0.20.* - - cmake>=3.14 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.49.0,!=0.51.0 - - numpy - - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 - - fastavro>=0.22.9 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinx_rtd_theme - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - recommonmark - - pandoc=<2.0.0 - - cudatoolkit=10.1 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.0.7 - - mypy=0.782 - - typing_extensions - - pre_commit - - dask==2021.4.0 - - distributed>=2.22.0,<=2021.4.0 - - streamz - - dlpack - - arrow-cpp=1.0.1 - - arrow-cpp-proc * cuda - - boost-cpp>=1.72.0 - - double-conversion - - rapidjson - - flatbuffers - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git - - pyorc diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml deleted file mode 100644 index cc78894a99c..00000000000 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge - - defaults -dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 - - cupy>7.1.0,<9.0.0a0 - - rmm=0.20.* - - cmake>=3.14 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.49,!=0.51.0 - - numpy - - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 - - fastavro>=0.22.9 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinx_rtd_theme - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - recommonmark - - pandoc=<2.0.0 - - cudatoolkit=10.2 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.0.7 - - mypy=0.782 - - typing_extensions - - pre_commit - - dask==2021.4.0 - - distributed>=2.22.0,<=2021.4.0 - - streamz - - dlpack - - arrow-cpp=1.0.1 - - arrow-cpp-proc * cuda - - boost-cpp>=1.72.0 - - double-conversion - - rapidjson - - flatbuffers - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git - - pyorc diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 10eb683657b..0b9905c67f9 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.7,<3.9 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0,<1.3.0dev0 + - pandas>=1.0,<=1.2.4 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda11.1.yml b/conda/environments/cudf_dev_cuda11.1.yml index 30062e38021..4dd3a96c154 100644 --- a/conda/environments/cudf_dev_cuda11.1.yml +++ b/conda/environments/cudf_dev_cuda11.1.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.7,<3.9 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0,<1.3.0dev0 + - pandas>=1.0,<=1.2.4 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 63821910790..b2323f2b2e9 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -17,7 +17,7 @@ dependencies: - python>=3.7,<3.9 - numba>=0.49,!=0.51.0 - numpy - - pandas>=1.0,<1.3.0dev0 + - pandas>=1.0,<=1.2.4 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 5635f54ba20..c9d2ee06d58 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -35,7 +35,7 @@ requirements: - protobuf - python - typing_extensions - - pandas >=1.0,<1.3.0dev0 + - pandas >=1.0,<=1.2.4 - cupy >7.1.0,<9.0.0a0 - numba >=0.49.0 - numpy diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 453707e4559..2ec22edc491 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -210,8 +210,8 @@ add_library(cudf src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/interop/dlpack.cpp - src/interop/from_arrow.cpp - src/interop/to_arrow.cpp + src/interop/from_arrow.cu + src/interop/to_arrow.cu src/io/avro/avro.cpp src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu @@ -247,6 +247,7 @@ add_library(cudf src/io/parquet/reader_impl.cu src/io/parquet/writer_impl.cu src/io/statistics/column_stats.cu + src/io/utilities/column_buffer.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp 
src/io/utilities/file_io_utilities.cpp diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp index 6fb6a07a8d0..c6a6b757951 100644 --- a/cpp/benchmarks/string/json_benchmark.cpp +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -113,7 +113,7 @@ static void BM_case(benchmark::State& state, QueryArg&&... query_arg) std::string json_path(query_arg...); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true); auto result = cudf::strings::get_json_object(scv, json_path); cudaStreamSynchronize(0); } diff --git a/cpp/benchmarks/text/ngrams_benchmark.cpp b/cpp/benchmarks/text/ngrams_benchmark.cpp index 1fe8e3b7f2e..52f55249631 100644 --- a/cpp/benchmarks/text/ngrams_benchmark.cpp +++ b/cpp/benchmarks/text/ngrams_benchmark.cpp @@ -43,7 +43,7 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true); switch (nt) { case ngrams_type::tokens: nvtext::generate_ngrams(input); break; case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; diff --git a/cpp/benchmarks/text/replace_benchmark.cpp b/cpp/benchmarks/text/replace_benchmark.cpp index f5428aee225..8f6704ab1af 100644 --- a/cpp/benchmarks/text/replace_benchmark.cpp +++ b/cpp/benchmarks/text/replace_benchmark.cpp @@ -54,7 +54,7 @@ static void BM_replace(benchmark::State& state) cudf::test::strings_column_wrapper replacements({"1", "2", "7", "0"}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true); nvtext::replace_tokens( view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements)); } diff --git a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake index d0fe88eb398..ce2921f5954 100644 --- a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake @@ -1,4 +1,4 @@ -set(CPM_DOWNLOAD_VERSION 4fad2eac0a3741df3d9c44b791f9163b74aa7b07) # 0.32.0 +set(CPM_DOWNLOAD_VERSION 7644c3a40fc7889f8dee53ce21e85dc390b883dc) # v0.32.1 if(CPM_SOURCE_CACHE) # Expand relative path. This is important if the provided path contains a tilde (~) diff --git a/cpp/docs/TESTING.md b/cpp/docs/TESTING.md index 638f7224ab8..2c7b62b8b6d 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/docs/TESTING.md @@ -1,7 +1,7 @@ # Unit Testing in libcudf Unit tests in libcudf are written using -[Google Test](https://github.com/google/googletest/blob/master/googletest/docs/primer.md). +[Google Test](https://github.com/google/googletest/blob/master/docs/primer.md). **Important:** Instead of including `gtest/gtest.h` directly, use `#include `. @@ -59,7 +59,7 @@ files, and are therefore preferred in test code over `thrust::device_vector`. ## Base Fixture -All libcudf unit tests should make use of a GTest ["Test Fixture"](https://github.com/google/googletest/blob/master/googletest/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests). +All libcudf unit tests should make use of a GTest ["Test Fixture"](https://github.com/google/googletest/blob/master/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests). Even if the fixture is empty, it should inherit from the base fixture `cudf::test::BaseFixture` found in `include/cudf_test/base_fixture.hpp`. 
This ensures that RMM is properly initialized and finalized. `cudf::test::BaseFixture` already inherits from `::testing::Test` and therefore it is @@ -75,7 +75,7 @@ class MyTestFiture : public cudf::test::BaseFixture {...}; In general, libcudf features must work across all of the supported types (there are exceptions e.g. not all binary operations are supported for all types). In order to automate the process of running the same tests across multiple types, we use GTest's -[Typed Tests](https://github.com/google/googletest/blob/master/googletest/docs/advanced.md#typed-tests). +[Typed Tests](https://github.com/google/googletest/blob/master/docs/advanced.md#typed-tests). Typed tests allow you to write a test once and run it across a list of types. For example: diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 7a560e4c048..1dd0d472d0d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include -#include #include #include @@ -567,15 +567,14 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - thrust::host_vector target_masks(target.size()); + std::vector target_masks(target.size()); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); - rmm::device_vector d_target_masks(target_masks); + auto d_target_masks = make_device_uvector_async(target_masks, stream); - auto const masks = d_target_masks.data().get(); auto const device_source = table_device_view::create(source, stream); - auto d_valid_counts = rmm::device_vector(target.size()); + auto d_valid_counts = make_zeroed_device_uvector_async(target.size(), stream); // Dispatch operation enum to get implementation auto const impl = [op]() { @@ -591,14 +590,14 @@ void gather_bitmask(table_view const& source, }(); impl(*device_source, gather_map, - masks, + d_target_masks.data(), target.size(), target_rows, - d_valid_counts.data().get(), + d_valid_counts.data(), stream); // Copy the valid counts into each column - auto const valid_counts = thrust::host_vector(d_valid_counts); + auto const valid_counts = make_std_vector_sync(d_valid_counts, stream); for (size_t i = 0; i < target.size(); ++i) { if (target[i]->nullable()) { auto const null_count = target_rows - valid_counts[i]; diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index c72f9327108..db8e6e4a156 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -28,9 +28,56 @@ #include #include +#include + namespace cudf { namespace detail { +/** + * @brief Asynchronously construct a `device_uvector` and set all elements to zero. + * + * @note This function does not synchronize `stream`. 
+ * + * @tparam T The type of the data to copy + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory and perform the memset + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing zeros + */ +template +rmm::device_uvector make_zeroed_device_uvector_async( + std::size_t size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + rmm::device_uvector ret(size, stream, mr); + CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); + return ret; +} + +/** + * @brief Synchronously construct a `device_uvector` and set all elements to zero. + * + * @note This function synchronizes `stream`. + * + * @tparam T The type of the data to copy + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory and perform the memset + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing zeros + */ +template +rmm::device_uvector make_zeroed_device_uvector_sync( + std::size_t size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + rmm::device_uvector ret(size, stream, mr); + CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); + stream.synchronize(); + return ret; +} + /** * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index c685837ae2b..11ce9199c2d 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -25,7 +25,6 @@ #include #include -#include #include namespace cudf { diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 176a548da4d..7121a6a5a8a 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -92,8 +93,9 @@ std::unique_ptr copy_if_else( auto d_offsets = offsets_column->view().template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = create_chars_child_column(strings_count, bytes, stream, mr); auto d_chars = chars_column->mutable_view().template data(); // fill in chars thrust::for_each_n( diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index c5d87258b7a..c0fa74c4662 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -180,10 +181,10 @@ std::unique_ptr copy_range( auto p_offsets = thrust::device_pointer_cast(p_offsets_column->view().template data()); - auto chars_bytes = p_offsets[target.size()]; - - auto p_chars_column = strings::detail::create_chars_child_column( - target.size(), null_count, chars_bytes, stream, mr); + auto const chars_bytes = + cudf::detail::get_value(p_offsets_column->view(), target.size(), stream); + auto p_chars_column = + strings::detail::create_chars_child_column(target.size(), chars_bytes, stream, mr); // copy to the chars column diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 988fa552100..86f79881408 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -65,7 +65,7 @@ std::unique_ptr gather_chars(StringIterator strings_begin, auto const output_count = std::distance(map_begin, map_end); if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); - auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto chars_column = create_chars_child_column(output_count, chars_bytes, stream, mr); auto const d_chars = chars_column->mutable_view().template data(); auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index caac0579085..7541be177f1 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -82,9 +82,9 @@ std::unique_ptr merge(strings_column_view const& lhs, auto d_offsets = offsets_column->view().template data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); // merge the strings auto d_chars = chars_column->mutable_view().template data(); thrust::for_each_n(rmm::exec_policy(stream), diff --git a/cpp/include/cudf/strings/detail/modify_strings.cuh b/cpp/include/cudf/strings/detail/modify_strings.cuh index 6feaa039bab..d37d9da9f5e 100644 --- a/cpp/include/cudf/strings/detail/modify_strings.cuh +++ b/cpp/include/cudf/strings/detail/modify_strings.cuh @@ -82,11 +82,11 @@ std::unique_ptr modify_strings(strings_column_view const& strings, // one (`d_chars = ...`) doesn't // build the chars column -- convert characters based on case_flag parameter - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); device_execute_functor d_execute_fctr{ d_column, d_new_offsets, d_chars, std::forward(args)...}; diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh 
b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 92cf537454c..69121b26b0b 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -115,7 +115,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, } else { // this approach is 2-3x faster for a large number of smaller string lengths auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); auto d_chars = chars_column->mutable_view().template data(); auto copy_chars = [d_chars] __device__(auto item) { string_index_pair const str = thrust::get<0>(item); @@ -185,9 +185,8 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, [] __device__(auto offset) { return static_cast(offset); }); // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_view.data()); return make_strings_column(strings_count, diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index a5db4d55001..4eff3f2dafc 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -30,7 +30,6 @@ namespace detail { * This will return the properly sized column to be filled in by the caller. * * @param strings_count Number of strings in the column. - * @param null_count Number of null string entries in the column. * @param bytes Number of bytes for the chars column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. @@ -38,7 +37,6 @@ namespace detail { */ std::unique_ptr create_chars_child_column( size_type strings_count, - size_type null_count, size_type bytes, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 4bcb46e4655..f5ab2046441 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -92,20 +92,6 @@ __device__ inline size_type string_view::length() const { if (_length == UNKNOWN_STRING_LENGTH) _length = strings::detail::characters_in_string(_data, _bytes); - if (_length && (_char_width == UNKNOWN_CHAR_WIDTH)) { - uint8_t const* ptr = reinterpret_cast(data()); - auto const first = strings::detail::bytes_in_utf8_byte(*ptr); - // see if they are all the same width - _char_width = (thrust::find_if(thrust::seq, - ptr, - ptr + size_bytes(), - [first](auto ch) { - auto width = strings::detail::bytes_in_utf8_byte(ch); - return (width != 0) && (width != first); - })) == (ptr + size_bytes()) - ? 
first - : VARIABLE_CHAR_WIDTH; - } return _length; } @@ -251,7 +237,7 @@ __device__ inline size_type string_view::byte_offset(size_type pos) const size_type offset = 0; const char* sptr = _data; const char* eptr = sptr + _bytes; - if (_char_width > 0) return pos * _char_width; + if (length() == size_bytes()) return pos; while ((pos > 0) && (sptr < eptr)) { size_type charbytes = strings::detail::bytes_in_utf8_byte(static_cast(*sptr++)); if (charbytes) --pos; @@ -408,7 +394,7 @@ __device__ inline string_view string_view::substr(size_type pos, size_type lengt __device__ inline size_type string_view::character_offset(size_type bytepos) const { - if (_char_width > 0) return bytepos / _char_width; + if (length() == size_bytes()) return bytepos; return strings::detail::characters_in_string(data(), bytepos); } diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 667a25c7641..4b1a901d72f 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -36,13 +36,6 @@ using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes */ constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1}; -/** - * @brief The char width is initialized to this value as a place-holder. - * - * The byte-width of the characters in a string is computed on-demand. - */ -constexpr int8_t UNKNOWN_CHAR_WIDTH{-1}; - /** * @brief This value is assigned to the _char_width member if the string * contains characters of different widths. @@ -314,7 +307,7 @@ class string_view { /** * @brief Default constructor represents an empty string. */ - CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0), _char_width(0) {} + CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0) {} /** * @brief Create instance from existing device char array. @@ -323,7 +316,7 @@ class string_view { * @param bytes Number of bytes in data array. */ CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes) - : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH) + : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH) { } @@ -334,10 +327,9 @@ class string_view { string_view& operator=(string_view&&) = default; private: - const char* _data{}; ///< Pointer to device memory contain char array for this string - size_type _bytes{}; ///< Number of bytes in _data for this string - mutable size_type _length{}; ///< Number of characters in this string (computed) - mutable int8_t _char_width{}; ///< Number of bytes per character if uniform width (computed) + const char* _data{}; ///< Pointer to device memory contain char array for this string + size_type _bytes{}; ///< Number of bytes in _data for this string + mutable size_type _length{}; ///< Number of characters in this string (computed) /** * @brief Return the character position of the given byte offset. 
diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 53be019f73b..530849601de 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -103,8 +103,8 @@ std::unique_ptr md5_hash(table_view const& input, auto offsets_column = cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - auto chars_column = strings::detail::create_chars_child_column( - input.num_rows(), 0, input.num_rows() * 32, stream, mr); + auto chars_column = + strings::detail::create_chars_child_column(input.num_rows(), input.num_rows() * 32, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cu similarity index 88% rename from cpp/src/interop/from_arrow.cpp rename to cpp/src/interop/from_arrow.cu index 99c9b386a15..ee02fadc017 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cu @@ -13,12 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #include #include #include #include +#include #include #include #include @@ -34,6 +34,8 @@ #include +#include + namespace cudf { namespace detail { @@ -54,7 +56,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) case arrow::Type::DOUBLE: return data_type(type_id::FLOAT64); case arrow::Type::DATE32: return data_type(type_id::TIMESTAMP_DAYS); case arrow::Type::TIMESTAMP: { - arrow::TimestampType const* type = static_cast(&arrow_type); + auto type = static_cast(&arrow_type); switch (type->unit()) { case arrow::TimeUnit::type::SECOND: return data_type(type_id::TIMESTAMP_SECONDS); case arrow::TimeUnit::type::MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); @@ -64,7 +66,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) } } case arrow::Type::DURATION: { - arrow::DurationType const* type = static_cast(&arrow_type); + auto type = static_cast(&arrow_type); switch (type->unit()) { case arrow::TimeUnit::type::SECOND: return data_type(type_id::DURATION_SECONDS); case arrow::TimeUnit::type::MILLI: return data_type(type_id::DURATION_MILLISECONDS); @@ -76,6 +78,10 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) case arrow::Type::STRING: return data_type(type_id::STRING); case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); case arrow::Type::LIST: return data_type(type_id::LIST); + case arrow::Type::DECIMAL: { + auto const type = static_cast(&arrow_type); + return data_type{type_id::DECIMAL64, -type->scale()}; + } case arrow::Type::STRUCT: return data_type(type_id::STRUCT); default: CUDF_FAIL("Unsupported type_id conversion to cudf"); } @@ -174,6 +180,54 @@ std::unique_ptr get_column(arrow::Array const& array, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +template <> +std::unique_ptr dispatch_to_cudf_column::operator()( + arrow::Array const& array, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using DeviceType = int64_t; + + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto data_buffer = array.data()->buffers[1]; + auto const num_rows = static_cast(array.length()); + + rmm::device_uvector buf(num_rows * BIT_WIDTH_RATIO, stream); + rmm::device_uvector out_buf(num_rows, stream, mr); + + CUDA_TRY(cudaMemcpyAsync( + reinterpret_cast(buf.data()), + reinterpret_cast(data_buffer->address()) + array.offset() * 
sizeof(DeviceType), + buf.size() * sizeof(DeviceType), + cudaMemcpyDefault, + stream.value())); + + auto every_other = [] __device__(size_type i) { return 2 * i; }; + auto gather_map = cudf::detail::make_counting_transform_iterator(0, every_other); + + thrust::gather( + rmm::exec_policy(stream), gather_map, gather_map + num_rows, buf.data(), out_buf.data()); + + auto null_mask = [&] { + if (not skip_mask and array.null_bitmap_data()) { + auto temp_mask = get_mask_buffer(array, stream, mr); + // If array is sliced, we have to copy whole mask and then take copy. + return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) + ? *temp_mask.release() + : cudf::detail::copy_bitmask(static_cast(temp_mask->data()), + array.offset(), + array.offset() + num_rows, + stream, + mr); + } + return rmm::device_buffer{}; + }(); + + return std::make_unique(type, num_rows, out_buf.release(), std::move(null_mask)); +} + template <> std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cu similarity index 88% rename from cpp/src/interop/to_arrow.cpp rename to cpp/src/interop/to_arrow.cu index 4bc50b21718..d9be3316f9d 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,9 @@ #include #include +#include +#include + namespace cudf { namespace detail { namespace { @@ -135,6 +139,49 @@ struct dispatch_to_arrow { } }; +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id id, + column_metadata const& metadata, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + using DeviceType = int64_t; + size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); + + auto count = thrust::make_counting_iterator(0); + + thrust::for_each(count, + count + input.size(), + [in = input.begin(), out = buf.data()] __device__(auto in_idx) { + auto const out_idx = in_idx * 2; + out[out_idx] = in[in_idx]; + out[out_idx + 1] = in[in_idx] < 0 ? 
-1 : 0; + }); + + auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); + auto result = arrow::AllocateBuffer(buf_size_in_bytes, ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); + + std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + + CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), + buf.data(), + buf_size_in_bytes, + cudaMemcpyDeviceToHost, + stream.value())); + + auto type = arrow::decimal(18, -input.type().scale()); + auto mask = fetch_mask_buffer(input, ar_mr, stream); + auto buffers = std::vector>{mask, data_buffer}; + auto data = std::make_shared(type, input.size(), buffers); + + return std::make_shared(data); +} + template <> std::shared_ptr dispatch_to_arrow::operator()(column_view input, cudf::type_id id, diff --git a/cpp/src/io/avro/avro_common.h b/cpp/src/io/avro/avro_common.h index 509eca41e61..3ef36863cd2 100644 --- a/cpp/src/io/avro/avro_common.h +++ b/cpp/src/io/avro/avro_common.h @@ -18,6 +18,7 @@ #include #include +#include namespace cudf { namespace io { @@ -56,6 +57,8 @@ enum type_kind_e { type_array, }; +using cudf::io::detail::string_index_pair; + } // namespace avro } // namespace io } // namespace cudf diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 321f5ee8963..ebd7f51a08a 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -72,7 +72,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, size_t max_rows, const uint8_t *cur, const uint8_t *end, - device_span global_dictionary) + device_span global_dictionary) { uint32_t array_start = 0, array_repeat_count = 0; int array_children = 0; @@ -123,8 +123,8 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, if (kind == type_enum) { // dictionary size_t idx = schema[i].count + v; if (idx < global_dictionary.size()) { - ptr = global_dictionary[idx].ptr; - count = global_dictionary[idx].count; + ptr = global_dictionary[idx].first; + count = global_dictionary[idx].second; } } else if (v >= 0 && cur + v <= end) { // string ptr = reinterpret_cast(cur); @@ -132,8 +132,8 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, cur += count; } if (dataptr != nullptr && row < max_rows) { - static_cast(dataptr)[row].ptr = ptr; - static_cast(dataptr)[row].count = count; + static_cast(dataptr)[row].first = ptr; + static_cast(dataptr)[row].second = count; } } } break; @@ -230,7 +230,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) gpuDecodeAvroColumnData(block_desc_s *blocks, schemadesc_s *schema_g, - device_span global_dictionary, + device_span global_dictionary, const uint8_t *avro_data, uint32_t num_blocks, uint32_t schema_len, @@ -313,7 +313,7 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) */ void DecodeAvroColumnData(block_desc_s *blocks, schemadesc_s *schema, - device_span global_dictionary, + device_span global_dictionary, const uint8_t *avro_data, uint32_t num_blocks, uint32_t schema_len, diff --git a/cpp/src/io/avro/avro_gpu.h b/cpp/src/io/avro/avro_gpu.h index 95b6e13d3f6..a82d3604d02 100644 --- a/cpp/src/io/avro/avro_gpu.h +++ b/cpp/src/io/avro/avro_gpu.h @@ -25,13 +25,6 @@ namespace cudf { namespace io { namespace avro { namespace gpu { -/** - * @brief Struct to describe the output of a string datatype - */ -struct nvstrdesc_s { - const char *ptr; - size_t count; -}; /** * @brief Struct to describe the avro schema @@ 
-59,7 +52,7 @@ struct schemadesc_s { */ void DecodeAvroColumnData(block_desc_s *blocks, schemadesc_s *schema, - cudf::device_span global_dictionary, + cudf::device_span global_dictionary, const uint8_t *avro_data, uint32_t num_blocks, uint32_t schema_len, diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 42035687750..600633f0ed8 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -235,7 +235,7 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ void reader::impl::decode_data(const rmm::device_buffer &block_data, const std::vector> &dict, - device_span global_dictionary, + device_span global_dictionary, size_t num_rows, std::vector> selection, std::vector &out_buffers, @@ -393,10 +393,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, for (const auto &sym : col_schema.symbols) { dictionary_data_size += sym.length(); } } - rmm::device_uvector d_global_dict(total_dictionary_entries, stream); + rmm::device_uvector d_global_dict(total_dictionary_entries, stream); rmm::device_uvector d_global_dict_data(dictionary_data_size, stream); if (total_dictionary_entries > 0) { - std::vector h_global_dict(total_dictionary_entries); + std::vector h_global_dict(total_dictionary_entries); std::vector h_global_dict_data(dictionary_data_size); size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { @@ -406,10 +406,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, for (size_t j = 0; j < dict[i].second; j++) { auto const &symbols = col_schema.symbols[j]; - auto const data_dst = h_global_dict_data.data() + dict_pos; - auto const len = symbols.length(); - col_dict_entries[j].ptr = data_dst; - col_dict_entries[j].count = len; + auto const data_dst = h_global_dict_data.data() + dict_pos; + auto const len = symbols.length(); + col_dict_entries[j].first = data_dst; + col_dict_entries[j].second = len; std::copy(symbols.c_str(), symbols.c_str() + len, data_dst); dict_pos += len; @@ -418,7 +418,7 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, CUDA_TRY(cudaMemcpyAsync(d_global_dict.data(), h_global_dict.data(), - h_global_dict.size() * sizeof(gpu::nvstrdesc_s), + h_global_dict.size() * sizeof(string_index_pair), cudaMemcpyDefault, stream.value())); CUDA_TRY(cudaMemcpyAsync(d_global_dict_data.data(), diff --git a/cpp/src/io/avro/reader_impl.hpp b/cpp/src/io/avro/reader_impl.hpp index 22fa1aaa760..8e09da03563 100644 --- a/cpp/src/io/avro/reader_impl.hpp +++ b/cpp/src/io/avro/reader_impl.hpp @@ -97,7 +97,7 @@ class reader::impl { */ void decode_data(const rmm::device_buffer &block_data, const std::vector> &dict, - cudf::device_span global_dictionary, + cudf::device_span global_dictionary, size_t num_rows, std::vector> columns, std::vector &out_buffers, diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 953872ab7ed..541163eb086 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -357,8 +357,6 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes, first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block); if (first_free_block == ~0 || first_free_block >= ext_heap_size) { // Some other block is holding the heap or there are no free blocks: try again later - // Wait a bit in an attempt to make the spin less resource-hungry - nanosleep(100); continue; } if (first_free_block == 0) { @@ -408,8 +406,7 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t 
bytes, } } while (blk_next != 0 && blk_next < ext_heap_size); first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block); - // Wait a while since reaching here means the heap is full - nanosleep(10000); + // Reaching here means the heap is full // Just in case we're trying to allocate more than the entire heap if (len > ext_heap_size - 4 * sizeof(uint32_t)) { break; } } @@ -429,8 +426,7 @@ static __device__ void ext_heap_free(void *ptr, for (;;) { first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block); if (first_free_block != ~0) { break; } - // Some other block is holding the heap: wait - nanosleep(50); + // Some other block is holding the heap } if (first_free_block >= ext_heap_size) { // Heap is currently empty diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index a31cf1717e7..eda1d37f78c 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -512,13 +512,10 @@ __device__ void decode_symbols(inflate_state_s *s) #if ENABLE_PREFETCH // Wait for prefetcher to fetch a worst-case of 48 bits per symbol while ((*(volatile int32_t *)&s->pref.cur_p - (int32_t)(size_t)cur < batch_size * 6) || - (s->x.batch_len[batch] != 0)) + (s->x.batch_len[batch] != 0)) {} #else - while (s->x.batch_len[batch] != 0) + while (s->x.batch_len[batch] != 0) {} #endif - { - nanosleep(100); - } batch_len = 0; #if ENABLE_PREFETCH if (cur + (bitpos >> 3) >= end) { @@ -662,7 +659,7 @@ __device__ void decode_symbols(inflate_state_s *s) if (batch_len != 0) batch = (batch + 1) & (batch_count - 1); } while (sym != 256); - while (s->x.batch_len[batch] != 0) { nanosleep(150); } + while (s->x.batch_len[batch] != 0) {} s->x.batch_len[batch] = -1; s->bitbuf = bitbuf; s->bitpos = bitpos; @@ -779,7 +776,7 @@ __device__ void process_symbols(inflate_state_s *s, int t) uint32_t lit_mask; if (t == 0) { - while ((batch_len = s->x.batch_len[batch]) == 0) { nanosleep(100); } + while ((batch_len = s->x.batch_len[batch]) == 0) {} } else { batch_len = 0; } @@ -962,8 +959,6 @@ __device__ void prefetch_warp(volatile inflate_state_s *s, int t) s->pref.cur_p = cur_p; __threadfence_block(); } - } else if (t == 0) { - nanosleep(150); } } } diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 2b799b5e1bf..c58880c9ed8 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -99,7 +99,6 @@ __device__ void snappy_prefetch_bytestream(unsnap_state_s *s, int t) blen = 0; break; } - nanosleep(100); } } blen = shuffle(blen); @@ -281,7 +280,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t) if (t == 0) { s->q.prefetch_rdpos = cur; #pragma unroll(1) // We don't want unrolling here - while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { nanosleep(50); } + while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {} b = &s->q.batch[batch * batch_size]; } // Process small symbols in parallel: for data that does not get good compression, @@ -441,7 +440,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t) // Wait for prefetcher s->q.prefetch_rdpos = cur; #pragma unroll(1) // We don't want unrolling here - while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { nanosleep(50); } + while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {} dst_pos += blen; if (bytes_left < blen) break; bytes_left -= blen; @@ -457,7 +456,7 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t) } batch_len = shuffle(batch_len); if (t == 0) { - while (s->q.batch_len[batch] != 0) { 
nanosleep(100); } + while (s->q.batch_len[batch] != 0) {} } if (batch_len != batch_size) { break; } } @@ -490,7 +489,7 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s int32_t batch_len, blen_t, dist_t; if (t == 0) { - while ((batch_len = s->q.batch_len[batch]) == 0) { nanosleep(100); } + while ((batch_len = s->q.batch_len[batch]) == 0) {} } else { batch_len = 0; } diff --git a/cpp/src/io/csv/csv.h b/cpp/src/io/csv/csv.h deleted file mode 100644 index b20ca4222b2..00000000000 --- a/cpp/src/io/csv/csv.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index 7f3c2ab4942..4e4ddd09a9f 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -16,7 +16,7 @@ #pragma once -#include "thrust/reduce.h" +#include #include #include @@ -435,4 +435,4 @@ __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) } } // namespace io -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 5a7c772f73c..efd2243febc 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -190,8 +190,8 @@ struct dispatch_from_durations_fn { // build chars column auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = strings::detail::create_chars_child_column( - strings_count, durations.null_count(), chars_bytes, stream, mr); + auto chars_column = + strings::detail::create_chars_child_column(strings_count, chars_bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 2764eb0980c..d61c2847b7e 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -16,7 +16,7 @@ #pragma once -#include "csv.h" +#include "csv_common.h" #include "csv_gpu.h" #include diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp index 9c42a3666fb..965c036dc75 100644 --- a/cpp/src/io/csv/writer_impl.hpp +++ b/cpp/src/io/csv/writer_impl.hpp @@ -16,7 +16,7 @@ #pragma once -#include "csv.h" +#include "csv_common.h" #include "csv_gpu.h" #include diff --git a/cpp/src/io/json/json.h b/cpp/src/io/json/json.h deleted file mode 100644 index 0c2309d9d64..00000000000 --- a/cpp/src/io/json/json.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "json_common.h" diff --git a/cpp/src/io/json/json_common.h b/cpp/src/io/json/json_common.h index 0bcd4e95f9a..f33435c1673 100644 --- a/cpp/src/io/json/json_common.h +++ b/cpp/src/io/json/json_common.h @@ -17,6 +17,8 @@ #pragma once #include +#include #include class SerialTrieNode; +using cudf::io::detail::string_index_pair; diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 75910ae6b5b..b9ced355107 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -46,8 +46,6 @@ namespace json { namespace gpu { using namespace ::cudf; -using string_pair = std::pair; - namespace { /** * @brief CUDA Kernel that adjusts the row range to exclude the character outside of the top level @@ -516,7 +514,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); + auto str_list = static_cast(output_columns[desc.column]); str_list[rec_id].first = desc.value_begin; str_list[rec_id].second = value_len; @@ -537,7 +535,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, } } } else if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); + auto str_list = static_cast(output_columns[desc.column]); str_list[rec_id].first = nullptr; str_list[rec_id].second = 0; } diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index fb8d7b2c7ab..4a68ce48f20 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -16,8 +16,8 @@ #pragma once -#include #include +#include "json_common.h" #include diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index ffd3dc58fe7..e6df503619f 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -21,7 +21,7 @@ #pragma once -#include "json.h" +#include "json_common.h" #include "json_gpu.h" #include diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 9abaabace4f..bef6bd56cba 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#include -#include -#include +#include "orc.h" #include +#include "orc_field_reader.hpp" +#include "orc_field_writer.hpp" namespace cudf { namespace io { diff --git a/cpp/src/io/orc/orc_field_reader.hpp b/cpp/src/io/orc/orc_field_reader.hpp index 9bb1ff4310b..8e9bca44340 100644 --- a/cpp/src/io/orc/orc_field_reader.hpp +++ b/cpp/src/io/orc/orc_field_reader.hpp @@ -15,8 +15,8 @@ */ #pragma once -#include #include +#include "orc.h" /** * @file orc_field_reader.hpp diff --git a/cpp/src/io/orc/orc_field_writer.hpp b/cpp/src/io/orc/orc_field_writer.hpp index c60e5cbd23c..13c7befa3a1 100644 --- a/cpp/src/io/orc/orc_field_writer.hpp +++ b/cpp/src/io/orc/orc_field_writer.hpp @@ -15,9 +15,9 @@ */ #pragma once -#include #include #include +#include "orc.h" /** * @file orc_field_writer.hpp diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 55df0adf95b..dadc8a06281 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -19,10 +19,12 @@ #include "timezone.cuh" #include -#include #include #include #include +#include +#include +#include "orc_common.h" #include @@ -30,12 +32,15 @@ namespace cudf { namespace io { namespace orc { namespace gpu { + +using cudf::detail::device_2dspan; + struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(const uint8_t *compressed_data_, size_t compressed_size_) : compressed_data(compressed_data_), - compressed_data_size(compressed_size_), uncompressed_data(nullptr), + compressed_data_size(compressed_size_), decctl(nullptr), decstatus(nullptr), copyctl(nullptr), @@ -67,14 +72,6 @@ enum StreamIndexType { CI_NUM_STREAMS }; -/** - * @brief Struct to describe the output of a string datatype - */ -struct nvstrdesc_s { - const char *ptr; - size_t count; -}; - /** * @brief Struct to describe a single entry in the global dictionary */ @@ -292,8 +289,8 @@ void DecodeOrcColumnData(ColumnDesc const *chunks, * @param[in, out] streams chunk streams device array [column][rowgroup] * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void EncodeOrcColumnData(detail::device_2dspan chunks, - detail::device_2dspan streams, +void EncodeOrcColumnData(device_2dspan chunks, + device_2dspan streams, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -307,10 +304,10 @@ void EncodeOrcColumnData(detail::device_2dspan chunks, * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ void EncodeStripeDictionaries(StripeDictionary *stripes, - detail::device_2dspan chunks, + device_2dspan chunks, uint32_t num_string_columns, uint32_t num_stripes, - detail::device_2dspan enc_streams, + device_2dspan enc_streams, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -321,7 +318,7 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ void set_chunk_columns(const table_device_view &view, - detail::device_2dspan chunks, + device_2dspan chunks, rmm::cuda_stream_view stream); /** @@ -331,8 +328,8 @@ void set_chunk_columns(const table_device_view &view, * @param[in,out] enc_streams chunk streams device array [column][rowgroup] * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void CompactOrcDataStreams(detail::device_2dspan strm_desc, - detail::device_2dspan enc_streams, +void CompactOrcDataStreams(device_2dspan strm_desc, + device_2dspan enc_streams, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -352,8 +349,8 @@ void 
CompressOrcDataStreams(uint8_t *compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, - detail::device_2dspan strm_desc, - detail::device_2dspan enc_streams, + device_2dspan strm_desc, + device_2dspan enc_streams, gpu_inflate_input_s *comp_in, gpu_inflate_status_s *comp_out, rmm::cuda_stream_view stream = rmm::cuda_stream_default); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index dd4972ee8f8..63f184a9bff 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -23,7 +23,7 @@ #include "timezone.cuh" #include -#include +#include "orc.h" #include #include diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 6206d98773f..4abccb5bf25 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -25,6 +25,8 @@ namespace io { namespace orc { namespace gpu { +using cudf::io::detail::string_index_pair; + // Must be able to handle 512x 8-byte values. These values are base 128 encoded // so 8 byte value is expanded to 10 bytes. constexpr int bytestream_buffer_size = 512 * 8 * 2; @@ -147,9 +149,9 @@ static __device__ void bytestream_init(volatile orc_bytestream_s *bs, const uint8_t *base, uint32_t len) { - uint32_t pos = static_cast(7 & reinterpret_cast(base)); + uint32_t pos = (len > 0) ? static_cast(7 & reinterpret_cast(base)) : 0; bs->base = base - pos; - bs->pos = (len > 0) ? pos : 0; + bs->pos = pos; bs->len = (len + pos + 7) & ~7; bs->fill_pos = 0; bs->fill_count = min(bs->len, bytestream_buffer_size) >> 3; @@ -1683,9 +1685,9 @@ __global__ void __launch_bounds__(block_size) case BINARY: case VARCHAR: case CHAR: { - nvstrdesc_s *strdesc = &static_cast(data_out)[row]; - void const *ptr = nullptr; - uint32_t count = 0; + string_index_pair *strdesc = &static_cast(data_out)[row]; + void const *ptr = nullptr; + uint32_t count = 0; if (is_dictionary(s->chunk.encoding_kind)) { auto const dict_idx = s->vals.u32[t + vals_skipped]; if (dict_idx < s->chunk.dict_len) { @@ -1703,8 +1705,8 @@ __global__ void __launch_bounds__(block_size) count = secondary_val; } } - strdesc->ptr = static_cast(ptr); - strdesc->count = count; + strdesc->first = static_cast(ptr); + strdesc->second = count; break; } case TIMESTAMP: { diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 10932d36309..6ed9071f5b7 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -27,7 +27,7 @@ namespace io { namespace orc { namespace gpu { -using detail::device_2dspan; +using cudf::detail::device_2dspan; constexpr int scratch_buffer_size = 512 * 4; @@ -1226,8 +1226,8 @@ void CompressOrcDataStreams(uint8_t *compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, - detail::device_2dspan strm_desc, - detail::device_2dspan enc_streams, + device_2dspan strm_desc, + device_2dspan enc_streams, gpu_inflate_input_s *comp_in, gpu_inflate_status_s *comp_out, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 61917403b41..42cb15a56b7 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -40,7 +40,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat int strm_id = blockIdx.x * 4 + (threadIdx.x / 32); int lane_id = threadIdx.x % 32; - if (lane_id == 0) { s->info = strm_info[strm_id]; } + if (strm_id < num_streams && lane_id == 0) { s->info = strm_info[strm_id]; } __syncthreads(); if (strm_id < 
num_streams) { diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index ddb5006098d..a9b8eb0ac6b 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "compact_protocol_writer.hpp" namespace cudf { namespace io { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 680ea078a2f..2ce9245490e 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include "parquet.hpp" +#include "parquet_common.hpp" #include #include diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 538e238b5ea..dfd9c1384c5 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include #include #include +#include "parquet_gpu.hpp" #include #include @@ -518,13 +518,14 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo if (s->dict_base) { // String dictionary - uint32_t dict_pos = (s->dict_bits > 0) - ? s->dict_idx[src_pos & (non_zero_buffer_size - 1)] * sizeof(nvstrdesc_s) - : 0; + uint32_t dict_pos = (s->dict_bits > 0) ? s->dict_idx[src_pos & (non_zero_buffer_size - 1)] * + sizeof(string_index_pair) + : 0; if (dict_pos < (uint32_t)s->dict_size) { - const nvstrdesc_s *src = reinterpret_cast(s->dict_base + dict_pos); - ptr = src->ptr; - len = src->count; + const string_index_pair *src = + reinterpret_cast(s->dict_base + dict_pos); + ptr = src->first; + len = src->second; } } else { // Plain encoding @@ -539,9 +540,9 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo *static_cast(dstv) = device_str2hash32(ptr, len); } else { // Output string descriptor - nvstrdesc_s *dst = static_cast(dstv); - dst->ptr = ptr; - dst->count = len; + string_index_pair *dst = static_cast(dstv); + dst->first = ptr; + dst->second = len; } } @@ -1010,7 +1011,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, // Fall through to DOUBLE case DOUBLE: s->dtype_len = 8; break; case INT96: s->dtype_len = 12; break; - case BYTE_ARRAY: s->dtype_len = sizeof(nvstrdesc_s); break; + case BYTE_ARRAY: s->dtype_len = sizeof(string_index_pair); break; default: // FIXED_LEN_BYTE_ARRAY: s->dtype_len = dtype_len_out; s->error |= (s->dtype_len <= 0); @@ -1094,7 +1095,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { // String dictionary: use index s->dict_base = reinterpret_cast(s->col.str_dict_index); - s->dict_size = s->col.page_info[0].num_input_values * sizeof(nvstrdesc_s); + s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); } else { s->dict_base = s->col.page_info[0].page_data; // dictionary is always stored in the first page diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index 2676f30474d..30842820448 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include #include +#include "parquet_gpu.hpp" #include diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 51ec0013f1a..6c31605887a 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include +#include "parquet_gpu.hpp" #include #include diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 34f5ee6fb1a..bc10fd92566 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -15,8 +15,8 @@ */ #include -#include #include +#include "parquet_gpu.hpp" #include @@ -447,10 +447,10 @@ extern "C" __global__ void __launch_bounds__(128) if (chunk >= num_chunks) { return; } if (!lane_id && ck->num_dict_pages > 0 && ck->str_dict_index) { // Data type to describe a string - nvstrdesc_s *dict_index = ck->str_dict_index; - const uint8_t *dict = ck->page_info[0].page_data; - int dict_size = ck->page_info[0].uncompressed_page_size; - int num_entries = ck->page_info[0].num_input_values; + string_index_pair *dict_index = ck->str_dict_index; + const uint8_t *dict = ck->page_info[0].page_data; + int dict_size = ck->page_info[0].uncompressed_page_size; + int num_entries = ck->page_info[0].num_input_values; int pos = 0, cur = 0; for (int i = 0; i < num_entries; i++) { int len = 0; @@ -464,8 +464,8 @@ extern "C" __global__ void __launch_bounds__(128) } } // TODO: Could store 8 entries in shared mem, then do a single warp-wide store - dict_index[i].ptr = reinterpret_cast(dict + pos + 4); - dict_index[i].count = len; + dict_index[i].first = reinterpret_cast(dict + pos + 4); + dict_index[i].second = len; } } } diff --git a/cpp/src/io/parquet/parquet.cpp b/cpp/src/io/parquet/parquet.cpp index 40ce222825b..2a1bd0d5a18 100644 --- a/cpp/src/io/parquet/parquet.cpp +++ b/cpp/src/io/parquet/parquet.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ +#include "parquet.hpp" #include -#include namespace cudf { namespace io { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 6c1c6209266..eefff518a9a 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include "parquet_common.hpp" #include #include diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 555259c443d..a7698ea8a78 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include "parquet_common.hpp" #include #include @@ -39,6 +39,8 @@ namespace cudf { namespace io { namespace parquet { +using cudf::io::detail::string_index_pair; + /** * @brief Struct representing an input column in the file. 
*/ @@ -70,14 +72,6 @@ enum level_type { NUM_LEVEL_TYPES }; -/** - * @brief Struct to describe the output of a string datatype - */ -struct nvstrdesc_s { - const char *ptr; - size_t count; -}; - /** * @brief Nesting information */ @@ -211,7 +205,7 @@ struct ColumnChunkDesc { int32_t max_num_pages; // size of page_info array PageInfo *page_info; // output page info for up to num_dict_pages + // num_data_pages (dictionary pages first) - nvstrdesc_s *str_dict_index; // index for string dictionary + string_index_pair *str_dict_index; // index for string dictionary uint32_t **valid_map_base; // base pointers of valid bit map for this column void **column_data_base; // base pointers of column data int8_t codec; // compressed codec enum diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 698eb1569cb..363a90522f5 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -927,7 +927,7 @@ rmm::device_buffer reader::impl::decompress_page_data( }; // Brotli scratch memory for decompressing - rmm::device_vector debrotli_scratch; + rmm::device_buffer debrotli_scratch; // Count the exact number of compressed pages size_t num_comp_pages = 0; @@ -943,7 +943,7 @@ rmm::device_buffer reader::impl::decompress_page_data( num_comp_pages++; }); if (codec.first == parquet::BROTLI && codec.second > 0) { - debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.second)); + debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.second), stream); } } @@ -1001,7 +1001,7 @@ rmm::device_buffer reader::impl::decompress_page_data( case parquet::BROTLI: CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos), inflate_out.device_ptr(start_pos), - debrotli_scratch.data().get(), + debrotli_scratch.data(), debrotli_scratch.size(), argc - start_pos, stream)); @@ -1052,10 +1052,10 @@ void reader::impl::allocate_nesting_info(hostdevice_vector int target_page_index = 0; int src_info_index = 0; for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - auto &schema = _metadata->get_schema(src_col_schema); - auto const per_page_nesting_info_size = - max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); + int src_col_schema = chunks[idx].src_col_schema; + auto &schema = _metadata->get_schema(src_col_schema); + auto const per_page_nesting_info_size = std::max( + schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); // skip my dict pages target_page_index += chunks[idx].num_dict_pages; @@ -1083,7 +1083,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector int max_depth = _metadata->get_output_nesting_depth(src_col_schema); // # of nesting infos stored per page for this column - auto const per_page_nesting_info_size = max(schema.max_definition_level + 1, max_depth); + auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); // if this column has lists, generate depth remapping std::map, std::vector>> depth_remapping; @@ -1199,7 +1199,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu // Build index for string dictionaries since they can't be indexed // directly due to variable-sized elements - rmm::device_vector str_dict_index; + rmm::device_vector str_dict_index; if (total_str_dict_indexes > 0) { str_dict_index.resize(total_str_dict_indexes); } // TODO (dm): hd_vec should have begin and end iterator members diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp 
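For context: both the ORC and Parquet decoders above stop using the file-local `nvstrdesc_s` struct (`ptr`/`count`) and instead write decoded strings through the shared `string_index_pair` alias, so string output uses one type across cuIO. A minimal sketch of the mapping, assuming the alias is `thrust::pair<char const*, cudf::size_type>` (the template arguments are elided in the hunks above):

```cpp
#include <cudf/types.hpp>
#include <thrust/pair.h>

// Assumed definition of the shared alias (declared in column_buffer.hpp later
// in this diff; the exact element types are an assumption here).
using string_index_pair = thrust::pair<char const*, cudf::size_type>;

// Field mapping from the removed descriptor to the pair:
//   nvstrdesc_s::ptr   -> string_index_pair::first
//   nvstrdesc_s::count -> string_index_pair::second
__device__ void write_decoded_string(string_index_pair* out,
                                      char const* data,
                                      cudf::size_type length)
{
  out->first  = data;    // pointer into the decoded character data
  out->second = length;  // length in bytes
}
```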
index ca200936134..ffd8975a8d2 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -21,8 +21,8 @@ #pragma once -#include -#include +#include "parquet.hpp" +#include "parquet_gpu.hpp" #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1e8a6920ea4..b5700af2d6e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -21,10 +21,11 @@ #include "writer_impl.hpp" -#include #include +#include "compact_protocol_writer.hpp" #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include #include -#include #include #include @@ -504,7 +504,7 @@ struct parquet_column_view { rmm::cuda_stream_view stream); column_view leaf_column_view() const; - gpu::parquet_column_device_view get_device_view(); + gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream); column_view cudf_column_view() const { return cudf_col; } parquet::Type physical_type() const { return schema_node.type; } @@ -517,21 +517,21 @@ struct parquet_column_view { bool is_list() const noexcept { return _is_list; } // Dictionary related member functions - uint32_t *get_dict_data() { return (_dict_data.size()) ? _dict_data.data().get() : nullptr; } - uint32_t *get_dict_index() { return (_dict_index.size()) ? _dict_index.data().get() : nullptr; } + uint32_t *get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; } + uint32_t *get_dict_index() { return (_dict_index.size()) ? _dict_index.data() : nullptr; } void use_dictionary(bool use_dict) { _dictionary_used = use_dict; } - void alloc_dictionary(size_t max_num_rows) + void alloc_dictionary(size_t max_num_rows, rmm::cuda_stream_view stream) { - _dict_data.resize(max_num_rows); - _dict_index.resize(max_num_rows); + _dict_data.resize(max_num_rows, stream); + _dict_index.resize(max_num_rows, stream); } - bool check_dictionary_used() + bool check_dictionary_used(rmm::cuda_stream_view stream) { if (!_dictionary_used) { - _dict_data.resize(0); - _dict_data.shrink_to_fit(); - _dict_index.resize(0); - _dict_index.shrink_to_fit(); + _dict_data.resize(0, stream); + _dict_data.shrink_to_fit(stream); + _dict_index.resize(0, stream); + _dict_index.shrink_to_fit(stream); } return _dictionary_used; } @@ -558,8 +558,8 @@ struct parquet_column_view { // Dictionary related members bool _dictionary_used = false; - rmm::device_vector _dict_data; - rmm::device_vector _dict_index; + rmm::device_uvector _dict_data; + rmm::device_uvector _dict_index; }; parquet_column_view::parquet_column_view(schema_tree_node const &schema_node, @@ -569,7 +569,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const &schema_node, _d_nullability(0, stream), _dremel_offsets(0, stream), _rep_level(0, stream), - _def_level(0, stream) + _def_level(0, stream), + _dict_data(0, stream), + _dict_index(0, stream) { // Construct single inheritance column_view from linked_column_view auto curr_col = schema_node.leaf_column.get(); @@ -680,16 +682,17 @@ column_view parquet_column_view::leaf_column_view() const return col; } -gpu::parquet_column_device_view parquet_column_view::get_device_view() +gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view stream) { column_view col = leaf_column_view(); auto desc = gpu::parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; - // TODO (dm): Enable dictionary for list after refactor - if 
(physical_type() != BOOLEAN && physical_type() != UNDEFINED_TYPE && !is_list()) { - alloc_dictionary(_data_count); + // TODO (dm): Enable dictionary for list and struct after refactor + if (physical_type() != BOOLEAN && physical_type() != UNDEFINED_TYPE && + !is_nested(cudf_col.type())) { + alloc_dictionary(_data_count, stream); desc.dict_index = get_dict_index(); desc.dict_data = get_dict_data(); } @@ -739,9 +742,9 @@ void writer::impl::gather_fragment_statistics( uint32_t num_fragments, uint32_t fragment_size) { - rmm::device_vector frag_stats_group(num_fragments * num_columns); + rmm::device_uvector frag_stats_group(num_fragments * num_columns, stream); - gpu::InitFragmentStatistics(frag_stats_group.data().get(), + gpu::InitFragmentStatistics(frag_stats_group.data(), frag.device_ptr(), col_desc.device_ptr(), num_fragments, @@ -749,7 +752,7 @@ void writer::impl::gather_fragment_statistics( fragment_size, stream); GatherColumnStatistics( - frag_stats_chunk, frag_stats_group.data().get(), num_fragments * num_columns, stream); + frag_stats_chunk, frag_stats_group.data(), num_fragments * num_columns, stream); stream.synchronize(); } @@ -761,10 +764,11 @@ void writer::impl::build_chunk_dictionaries( uint32_t num_dictionaries) { size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize; - rmm::device_vector dict_scratch(dict_scratch_size / sizeof(uint32_t)); + auto dict_scratch = cudf::detail::make_zeroed_device_uvector_async( + dict_scratch_size / sizeof(uint32_t), stream); chunks.host_to_device(stream); gpu::BuildChunkDictionaries(chunks.device_ptr(), - dict_scratch.data().get(), + dict_scratch.data(), dict_scratch_size, num_rowgroups * num_columns, stream); @@ -789,22 +793,22 @@ void writer::impl::init_encoder_pages(hostdevice_vector &ch uint32_t num_pages, uint32_t num_stats_bfr) { - rmm::device_vector page_stats_mrg(num_stats_bfr); + rmm::device_uvector page_stats_mrg(num_stats_bfr, stream); chunks.host_to_device(stream); InitEncoderPages(chunks.device_ptr(), pages, col_desc.device_ptr(), num_rowgroups, num_columns, - (num_stats_bfr) ? page_stats_mrg.data().get() : nullptr, - (num_stats_bfr > num_pages) ? page_stats_mrg.data().get() + num_pages : nullptr, + (num_stats_bfr) ? page_stats_mrg.data() : nullptr, + (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, stream); if (num_stats_bfr > 0) { - MergeColumnStatistics(page_stats, frag_stats, page_stats_mrg.data().get(), num_pages, stream); + MergeColumnStatistics(page_stats, frag_stats, page_stats_mrg.data(), num_pages, stream); if (num_stats_bfr > num_pages) { MergeColumnStatistics(page_stats + num_pages, page_stats, - page_stats_mrg.data().get() + num_pages, + page_stats_mrg.data() + num_pages, num_stats_bfr - num_pages, stream); } @@ -977,8 +981,8 @@ void writer::impl::write(table_view const &table) // This should've been `auto const&` but isn't since dictionary space is allocated when calling // get_device_view(). Fix during dictionary refactor. 
std::transform( - parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [](auto &pcol) { - return pcol.get_device_view(); + parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto &pcol) { + return pcol.get_device_view(stream); }); // Init page fragments @@ -1035,12 +1039,12 @@ void writer::impl::write(table_view const &table) } // Allocate column chunks and gather fragment statistics - rmm::device_vector frag_stats; + rmm::device_uvector frag_stats(0, stream); if (stats_granularity_ != statistics_freq::STATISTICS_NONE) { - frag_stats.resize(num_fragments * num_columns); + frag_stats.resize(num_fragments * num_columns, stream); if (frag_stats.size() != 0) { gather_fragment_statistics( - frag_stats.data().get(), fragments, col_desc, num_columns, num_fragments, fragment_size); + frag_stats.data(), fragments, col_desc, num_columns, num_fragments, fragment_size); } } // Initialize row groups and column chunks @@ -1063,8 +1067,7 @@ void writer::impl::write(table_view const &table) ck->bfr_size = 0; ck->compressed_size = 0; ck->fragments = fragments.device_ptr() + i * num_fragments + f; - ck->stats = - (frag_stats.size() != 0) ? frag_stats.data().get() + i * num_fragments + f : nullptr; + ck->stats = (frag_stats.size() != 0) ? frag_stats.data() + i * num_fragments + f : nullptr; ck->start_row = start_row; ck->num_rows = (uint32_t)md.row_groups[global_r].num_rows; ck->first_fragment = i * num_fragments + f; @@ -1112,7 +1115,7 @@ void writer::impl::write(table_view const &table) } // Free unused dictionaries - for (auto &col : parquet_columns) { col.check_dictionary_used(); } + for (auto &col : parquet_columns) { col.check_dictionary_used(stream); } // Build chunk dictionaries and count pages if (num_chunks != 0) { @@ -1167,10 +1170,10 @@ void writer::impl::write(table_view const &table) (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_pages + num_chunks : 0; rmm::device_buffer uncomp_bfr(max_uncomp_bfr_size, stream); rmm::device_buffer comp_bfr(max_comp_bfr_size, stream); - rmm::device_vector comp_in(max_comp_pages); - rmm::device_vector comp_out(max_comp_pages); - rmm::device_vector pages(num_pages); - rmm::device_vector page_stats(num_stats_bfr); + rmm::device_uvector comp_in(max_comp_pages, stream); + rmm::device_uvector comp_out(max_comp_pages, stream); + rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector page_stats(num_stats_bfr, stream); for (uint32_t b = 0, r = 0; b < (uint32_t)batch_list.size(); b++) { uint8_t *bfr = static_cast(uncomp_bfr.data()); uint8_t *bfr_c = static_cast(comp_bfr.data()); @@ -1188,9 +1191,9 @@ void writer::impl::write(table_view const &table) if (num_pages != 0) { init_encoder_pages(chunks, col_desc, - pages.data().get(), - (num_stats_bfr) ? page_stats.data().get() : nullptr, - (num_stats_bfr) ? frag_stats.data().get() : nullptr, + pages.data(), + (num_stats_bfr) ? page_stats.data() : nullptr, + (num_stats_bfr) ? frag_stats.data() : nullptr, num_rowgroups, num_columns, num_pages, @@ -1210,16 +1213,16 @@ void writer::impl::write(table_view const &table) uint32_t pages_in_batch = first_page_in_next_batch - first_page_in_batch; encode_pages( chunks, - pages.data().get(), + pages.data(), num_columns, pages_in_batch, first_page_in_batch, batch_list[b], r, - comp_in.data().get(), - comp_out.data().get(), - (stats_granularity_ == statistics_freq::STATISTICS_PAGE) ? page_stats.data().get() : nullptr, - (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? 
page_stats.data().get() + num_pages + comp_in.data(), + comp_out.data(), + (stats_granularity_ == statistics_freq::STATISTICS_PAGE) ? page_stats.data() : nullptr, + (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? page_stats.data() + num_pages : nullptr); for (; r < rnext; r++, global_r++) { for (auto i = 0; i < num_columns; i++) { diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index b8532d755eb..e5103122033 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -21,8 +21,8 @@ #pragma once -#include -#include +#include "parquet.hpp" +#include "parquet_gpu.hpp" #include #include diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 4c03f9a9ca0..759aa2517b6 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -36,16 +36,6 @@ inline __device__ void syncwarp(void) { __syncwarp(); } inline __device__ uint32_t ballot(int pred) { return __ballot_sync(~0, pred); } -template -inline __device__ void nanosleep(T d) -{ -#if (__CUDA_ARCH__ >= 700) - __nanosleep(d); -#else - clock(); -#endif -} - // Warp reduction helpers template inline __device__ T WarpReduceOr2(T acc) @@ -197,60 +187,5 @@ inline __device__ void memcpy_block(void *dstv, const void *srcv, uint32_t len, } } -/** - * @brief Compares two strings - */ -template -inline __device__ T nvstr_compare(const char *as, uint32_t alen, const char *bs, uint32_t blen) -{ - uint32_t len = min(alen, blen); - uint32_t i = 0; - if (len >= 4) { - uint32_t align_a = 3 & reinterpret_cast(as); - uint32_t align_b = 3 & reinterpret_cast(bs); - const uint32_t *as32 = reinterpret_cast(as - align_a); - const uint32_t *bs32 = reinterpret_cast(bs - align_b); - uint32_t ofsa = align_a * 8; - uint32_t ofsb = align_b * 8; - do { - uint32_t a = *as32++; - uint32_t b = *bs32++; - if (ofsa) a = __funnelshift_r(a, *as32, ofsa); - if (ofsb) b = __funnelshift_r(b, *bs32, ofsb); - if (a != b) { - return (lesser == greater || __byte_perm(a, 0, 0x0123) < __byte_perm(b, 0, 0x0123)) - ? lesser - : greater; - } - i += 4; - } while (i + 4 <= len); - } - while (i < len) { - uint8_t a = as[i]; - uint8_t b = bs[i]; - if (a != b) { return (a < b) ? lesser : greater; } - ++i; - } - return (alen == blen) ? equal : (alen < blen) ? lesser : greater; -} - -inline __device__ bool nvstr_is_lesser(const char *as, uint32_t alen, const char *bs, uint32_t blen) -{ - return nvstr_compare(as, alen, bs, blen); -} - -inline __device__ bool nvstr_is_greater(const char *as, - uint32_t alen, - const char *bs, - uint32_t blen) -{ - return nvstr_compare(as, alen, bs, blen); -} - -inline __device__ bool nvstr_is_equal(const char *as, uint32_t alen, const char *bs, uint32_t blen) -{ - return nvstr_compare(as, alen, bs, blen); -} - } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp new file mode 100644 index 00000000000..9170a9016c4 --- /dev/null +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file column_buffer.cpp + * @brief cuDF-IO column_buffer class implementation + */ + +#include "column_buffer.hpp" +#include + +namespace cudf { +namespace io { +namespace detail { + +void column_buffer::create(size_type _size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size = _size; + + switch (type.id()) { + case type_id::STRING: + _strings = std::make_unique>( + cudf::detail::make_zeroed_device_uvector_async(size, stream)); + break; + + // list columns store a buffer of int32's as offsets to represent + // their individual rows + case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; + + // struct columns store no data themselves. just validity and children. + case type_id::STRUCT: break; + + default: _data = create_data(type, size, stream, mr); break; + } + if (is_nullable) { + _null_mask = + cudf::detail::create_null_mask(size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + } +} + +/** + * @brief Creates a column from an existing set of device memory buffers. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param buffer Column buffer descriptors + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @return `std::unique_ptr` Column from the existing device data + */ +std::unique_ptr make_column(column_buffer& buffer, + column_name_info* schema_info, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (schema_info != nullptr) { schema_info->name = buffer.name; } + + switch (buffer.type.id()) { + case type_id::STRING: + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{"chars"}); + } + return make_strings_column(*buffer._strings, stream, mr); + + case type_id::LIST: { + // make offsets column + auto offsets = + std::make_unique(data_type{type_id::INT32}, buffer.size, std::move(buffer._data)); + + column_name_info* child_info = nullptr; + if (schema_info != nullptr) { + schema_info->children.push_back(column_name_info{"offsets"}); + schema_info->children.push_back(column_name_info{""}); + child_info = &schema_info->children.back(); + } + + // make child column + CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); + auto child = make_column(buffer.children[0], child_info, stream, mr); + + // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 + // less) + return make_lists_column(buffer.size - 1, + std::move(offsets), + std::move(child), + buffer._null_count, + std::move(buffer._null_mask), + stream, + mr); + } break; + + case type_id::STRUCT: { + std::vector> output_children; + output_children.reserve(buffer.children.size()); + std::transform(buffer.children.begin(), + buffer.children.end(), + std::back_inserter(output_children), + [&](column_buffer& col) { + column_name_info* child_info = nullptr; + if (schema_info != nullptr) { + 
schema_info->children.push_back(column_name_info{""}); + child_info = &schema_info->children.back(); + } + return make_column(col, child_info, stream, mr); + }); + + return make_structs_column(buffer.size, + std::move(output_children), + buffer._null_count, + std::move(buffer._null_mask), + stream, + mr); + } break; + + default: { + return std::make_unique(buffer.type, + buffer.size, + std::move(buffer._data), + std::move(buffer._null_mask), + buffer._null_count); + } + } +} + +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 75e9a4c18df..5da4b7a873b 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -60,18 +60,13 @@ inline rmm::device_buffer create_data( return data; } +using string_index_pair = thrust::pair; + /** * @brief Class for holding device memory buffers to column data that eventually * will be used to create a column. */ struct column_buffer { - // there is a potential bug here. In the decoding step, the buffer of - // data holding these pairs is cast to an nvstrdesc_s, which is a struct - // containing . So there is a mismatch between the - // size_type and the size_t. I believe this works because the str_pair is - // aligned out to 8 bytes anyway. - using str_pair = thrust::pair; - column_buffer() = default; // construct without a known size. call create() later to actually @@ -84,7 +79,7 @@ struct column_buffer { bool _is_nullable = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : type(_type), is_nullable(_is_nullable), _null_count(0) + : type(_type), is_nullable(_is_nullable) { create(_size, stream, mr); } @@ -101,30 +96,7 @@ struct column_buffer { // preprocessing steps such as in the Parquet reader void create(size_type _size, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - { - size = _size; - - switch (type.id()) { - case type_id::STRING: - _strings = std::make_unique>(size, stream); - cudaMemsetAsync(_strings->data(), 0, size * sizeof(str_pair), stream.value()); - break; - - // list columns store a buffer of int32's as offsets to represent - // their individual rows - case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, mr); break; - - // struct columns store no data themselves. just validity and children. - case type_id::STRUCT: break; - - default: _data = create_data(type, size, stream, mr); break; - } - if (is_nullable) { - _null_mask = cudf::detail::create_null_mask( - size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - } - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); auto data() { return _strings ? _strings->data() : _data.data(); } auto data_size() const { return _strings ? _strings->size() : _data.size(); } @@ -138,110 +110,24 @@ struct column_buffer { auto& null_count() { return _null_count; } - std::unique_ptr> _strings; + std::unique_ptr> _strings; rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; size_type _null_count{0}; - bool is_nullable{false}; data_type type{type_id::EMPTY}; + bool is_nullable{false}; size_type size{0}; std::vector children; uint32_t user_data{0}; // arbitrary user data std::string name; }; -namespace { -/** - * @brief Creates a column from an existing set of device memory buffers. 
- * - * @throws std::bad_alloc if device memory allocation fails - * - * @param buffer Column buffer descriptors - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @return `std::unique_ptr` Column from the existing device data - */ std::unique_ptr make_column( column_buffer& buffer, column_name_info* schema_info = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - using str_pair = thrust::pair; - - if (schema_info != nullptr) { schema_info->name = buffer.name; } - - switch (buffer.type.id()) { - case type_id::STRING: - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"chars"}); - } - return make_strings_column(*buffer._strings, stream, mr); - - case type_id::LIST: { - // make offsets column - auto offsets = - std::make_unique(data_type{type_id::INT32}, buffer.size, std::move(buffer._data)); - - column_name_info* child_info = nullptr; - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{""}); - child_info = &schema_info->children.back(); - } - - // make child column - CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = make_column(buffer.children[0], child_info, stream, mr); - - // make the final list column (note : size is the # of offsets, so our actual # of rows is 1 - // less) - return make_lists_column(buffer.size - 1, - std::move(offsets), - std::move(child), - buffer._null_count, - std::move(buffer._null_mask), - stream, - mr); - } break; - - case type_id::STRUCT: { - std::vector> output_children; - output_children.reserve(buffer.children.size()); - std::transform(buffer.children.begin(), - buffer.children.end(), - std::back_inserter(output_children), - [&](column_buffer& col) { - column_name_info* child_info = nullptr; - if (schema_info != nullptr) { - schema_info->children.push_back(column_name_info{""}); - child_info = &schema_info->children.back(); - } - return make_column(col, child_info, stream, mr); - }); - - return make_structs_column(buffer.size, - std::move(output_children), - buffer._null_count, - std::move(buffer._null_mask), - stream, - mr); - } break; - - default: { - return std::make_unique(buffer.type, - buffer.size, - std::move(buffer._data), - std::move(buffer._null_mask), - buffer._null_count); - } - } -} - -} // namespace + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace io diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 10af7bcb0bd..d133d813ab3 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include "file_io_utilities.hpp" #include diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 8f2a5389b4d..ac8deccd078 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include "file_io_utilities.hpp" namespace cudf { namespace io { diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 322296715fc..abf3a3fdef0 100644 --- 
a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include "file_io_utilities.hpp" #include diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index b7719cba580..9cfa46aaf11 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include "column_type_histogram.hpp" #include diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 1f7e0672404..84151bbce0e 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -65,9 +66,10 @@ std::pair, std::unique_ptr> form_offsets_and_cha auto d_offsets = offsets_column->view().template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + cudf::strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); return std::make_pair(std::move(offsets_column), std::move(chars_column)); } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 4cf6899116d..1e6c779c51a 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -225,8 +225,8 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< cudf::data_type(cudf::type_id::INT32), input.size(), cudf::mask_state::UNALLOCATED, stream); auto sizes_view = sizes->mutable_view(); - auto device_in = cudf::column_device_view::create(input); - auto device_replacement = cudf::column_device_view::create(replacement); + auto device_in = cudf::column_device_view::create(input, stream); + auto device_replacement = cudf::column_device_view::create(replacement, stream); rmm::device_buffer valid_bits = cudf::detail::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); @@ -245,14 +245,13 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< sizes_view.begin(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); - int32_t size; - CUDA_TRY(cudaMemcpyAsync( - &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); + auto const bytes = + cudf::detail::get_value(offsets_view, offsets_view.size() - 1, stream); // Allocate chars array and output null mask cudf::size_type null_count = input.size() - valid_counter.value(stream); std::unique_ptr output_chars = - cudf::strings::detail::create_chars_child_column(input.size(), null_count, size, stream, mr); + cudf::strings::detail::create_chars_child_column(input.size(), bytes, stream, mr); auto output_chars_view = output_chars->mutable_view(); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 517ef91eecb..43040ac5dfb 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -412,15 +413,13 @@ std::unique_ptr replace_kernel_forwarder::operator()(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); auto device_offsets = cudf::mutable_column_device_view::create(offsets_view); - int32_t size; - CUDA_TRY(cudaMemcpyAsync( - 
&size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const bytes = + cudf::detail::get_value(offsets_view, offsets_view.size() - 1, stream); // Allocate chars array and output null mask - cudf::size_type null_count = input_col.size() - valid_counter.value(stream); - std::unique_ptr output_chars = cudf::strings::detail::create_chars_child_column( - input_col.size(), null_count, size, stream, mr); + cudf::size_type null_count = input_col.size() - valid_counter.value(stream); + std::unique_ptr output_chars = + cudf::strings::detail::create_chars_child_column(input_col.size(), bytes, stream, mr); auto output_chars_view = output_chars->mutable_view(); auto device_chars = cudf::mutable_column_device_view::create(output_chars_view); diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 3e2cb4ac02f..b78dc963153 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -90,9 +90,9 @@ struct interleave_columns_functor { auto d_results_offsets = offsets_column->view().template data(); // Create the chars column - size_type bytes = thrust::device_pointer_cast(d_results_offsets)[num_strings]; - auto chars_column = - strings::detail::create_chars_child_column(num_strings, null_count, bytes, stream, mr); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), num_strings, stream); + auto chars_column = strings::detail::create_chars_child_column(num_strings, bytes, stream, mr); // Fill the chars column auto d_results_chars = chars_column->mutable_view().data(); thrust::for_each_n( diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh index 506334c2a3d..e99d88a9e84 100644 --- a/cpp/src/sort/sort_impl.cuh +++ b/cpp/src/sort/sort_impl.cuh @@ -18,10 +18,12 @@ #include #include +#include #include #include #include #include + #include #include @@ -123,14 +125,14 @@ std::unique_ptr sorted_order(table_view input, } auto flattened = structs::detail::flatten_nested_columns(input, column_order, null_precedence); - auto& input_flattened = std::get<0>(flattened); - auto device_table = table_device_view::create(input_flattened, stream); - rmm::device_vector d_column_order(std::get<1>(flattened)); + auto& input_flattened = std::get<0>(flattened); + auto device_table = table_device_view::create(input_flattened, stream); + auto const d_column_order = make_device_uvector_async(std::get<1>(flattened), stream); if (has_nulls(input_flattened)) { - rmm::device_vector d_null_precedence(std::get<2>(flattened)); - auto comparator = row_lexicographic_comparator( - *device_table, *device_table, d_column_order.data().get(), d_null_precedence.data().get()); + auto const d_null_precedence = make_device_uvector_async(std::get<2>(flattened), stream); + auto const comparator = row_lexicographic_comparator( + *device_table, *device_table, d_column_order.data(), d_null_precedence.data()); if (stable) { thrust::stable_sort(rmm::exec_policy(stream), mutable_indices_view.begin(), @@ -142,9 +144,11 @@ std::unique_ptr sorted_order(table_view input, mutable_indices_view.end(), comparator); } + // protection for temporary d_column_order and d_null_precedence + stream.synchronize(); } else { - auto comparator = row_lexicographic_comparator( - *device_table, *device_table, d_column_order.data().get()); + auto const comparator = + row_lexicographic_comparator(*device_table, *device_table, d_column_order.data()); if (stable) { thrust::stable_sort(rmm::exec_policy(stream), 
mutable_indices_view.begin(), @@ -156,6 +160,8 @@ std::unique_ptr sorted_order(table_view input, mutable_indices_view.end(), comparator); } + // protection for temporary d_column_order + stream.synchronize(); } return sorted_indices; diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 9f3a1caba8a..7f0e63e1790 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -138,8 +138,7 @@ std::unique_ptr convert_case(strings_column_view const& strings, get_special_case_mapping_table()}; // this utility calls the functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( - functor, strings.size(), strings.null_count(), stream, mr); + auto children = cudf::strings::detail::make_strings_children(functor, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(children.first), diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 0b384ad0631..d822fafc9bc 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -173,8 +173,7 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( - filterer, strings_count, strings.null_count(), stream, mr); + auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return make_strings_column(strings_count, diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index f9b8b9e0ea3..ebc31177b92 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -117,8 +117,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // create the chars column auto const bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); // fill the chars column auto d_results_chars = chars_column->mutable_view().data(); thrust::for_each_n( @@ -218,10 +217,9 @@ std::unique_ptr join_strings(strings_column_view const& strings, null_mask = cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr); null_count = 1; } - auto chars_column = - detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + auto chars_column = detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -377,9 +375,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, auto d_results_offsets = offsets_column->view().data(); // Create the chars column - size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); 
// Fill the chars column auto d_results_chars = chars_column->mutable_view().data(); diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index e1b203a60e2..8ff57340136 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -124,11 +124,10 @@ std::unique_ptr from_booleans(column_view const& booleans, auto d_offsets = offsets_view.data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, booleans.null_count(), bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = create_chars_child_column(strings_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 8b46f66a48f..d6f01e2b68d 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -962,11 +962,10 @@ std::unique_ptr from_timestamps(column_view const& timestamps, auto d_new_offsets = offsets_view.template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, timestamps.null_count(), bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = create_chars_child_column(strings_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); // fill in chars column with timestamps // dispatcher is called to handle the different timestamp types cudf::type_dispatcher(timestamps.type(), diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 9e0df374bc8..4135b542595 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
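The string builders above (and several below) replace `thrust::device_pointer_cast(d_offsets)[strings_count]` with `cudf::detail::get_value`, so the total chars size is read from the offsets column via a stream-ordered single-element copy rather than an untracked device dereference. A rough sketch of the pattern, assuming `int32_t` offsets and the `cudf/detail/get_value.cuh` header (the template argument is elided in the hunks above):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/detail/get_value.cuh>  // assumed location of cudf::detail::get_value
#include <rmm/cuda_stream_view.hpp>

// Sketch: number of bytes needed for the chars child of a strings column,
// i.e. the last entry of its offsets column.
int32_t chars_bytes_needed(cudf::column_view const& offsets,
                           cudf::size_type strings_count,
                           rmm::cuda_stream_view stream)
{
  // Old pattern: size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
  // New pattern: copy one element to host on the given stream.
  return cudf::detail::get_value<int32_t>(offsets, strings_count, stream);
}
```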
@@ -427,10 +427,8 @@ struct dispatch_from_durations_fn { // build chars column auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = detail::create_chars_child_column( - strings_count, durations.null_count(), chars_bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); + auto chars_column = detail::create_chars_child_column(strings_count, chars_bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index c22fee5ec05..cf450b19151 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -334,9 +334,8 @@ struct dispatch_from_fixed_point_fn { // build chars column auto const bytes = cudf::detail::get_value(offsets_column->view(), input.size(), stream); - auto chars_column = - detail::create_chars_child_column(input.size(), input.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); + auto chars_column = detail::create_chars_child_column(input.size(), bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index b6d99efd51f..2eb64e65d96 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -492,11 +492,10 @@ struct dispatch_from_floats_fn { auto d_offsets = offsets_view.template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - detail::create_chars_child_column(strings_count, floats.null_count(), bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); + auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); + auto chars_column = detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.template data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 7eee2b3cc0e..4e705318d43 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -349,11 +349,10 @@ struct dispatch_from_integers_fn { auto d_new_offsets = offsets_view.template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; - auto chars_column = - detail::create_chars_child_column(strings_count, integers.null_count(), bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); + auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); + auto chars_column = detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.template data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, diff --git 
a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 6347c0eab61..a2d80f5f2cc 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -190,10 +190,10 @@ std::unique_ptr integers_to_ipv4( auto d_offsets = offsets_column->view().data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, integers.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = create_chars_child_column(strings_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 54f73e70f58..84ca2e6880a 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -139,14 +139,11 @@ std::unique_ptr url_encode( auto offsets_column = make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); // build chars column - auto chars_column = - create_chars_child_column(strings_count, - strings.null_count(), - thrust::device_pointer_cast(d_offsets)[strings_count], - stream, - mr); - auto d_chars = chars_column->mutable_view().data(); + auto chars_column = create_chars_child_column(strings_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -396,7 +393,6 @@ std::unique_ptr url_decode( // create the chars column auto chars_column = create_chars_child_column(strings_count, - strings.null_count(), chars_bytes - (esc_count * 2), // replacing 3 bytes with 1 stream, mr); diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 779a64c9eb2..ae5e0b69346 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -78,9 +78,9 @@ std::unique_ptr fill( auto d_offsets = offsets_column->view().data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); // fill the chars column auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 7ed77a830ad..13733bd674b 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -139,8 +139,7 @@ std::unique_ptr filter_characters( // this utility calls the strip_fn to build the offsets and chars columns filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement}; - auto children = cudf::strings::detail::make_strings_children( - ffn, strings.size(), strings.null_count(), stream, mr); + auto children = cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); return make_strings_column(strings_count, std::move(children.first), diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index cd8aae12070..3b0290736ae 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -945,7 +945,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_json_object(col, json_path, 0, mr); + return detail::get_json_object(col, json_path, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 83a8d5c840f..429da119c90 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -80,10 +80,10 @@ std::unique_ptr pad( auto d_offsets = offsets_column->view().data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().data(); if (side == pad_side::LEFT) { thrust::for_each_n( @@ -168,10 +168,10 @@ std::unique_ptr zfill( auto d_offsets = offsets_column->view().data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index cac774ef43e..1ed1ee4d96f 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu 
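The backref and multi-regex replacers below, like `filter_characters` and `convert_case` earlier in this diff, drop the null-count argument when calling `cudf::strings::detail::make_strings_children`; the utility now needs only the size-and-execute functor, the row count, the stream, and the memory resource. A loose sketch of the two-pass contract, assuming the functor exposes `d_offsets`/`d_chars` pointers filled in by the utility (the header path and member names are assumptions, not taken from this diff):

```cpp
#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/detail/strings_children.cuh>  // assumed header for make_strings_children
#include <cudf/strings/string_view.cuh>
#include <cudf/types.hpp>

// Sketch of a size-and-execute functor: on pass 1 (d_chars == nullptr) it
// writes each row's output size into d_offsets; on pass 2 it writes the bytes
// into d_chars. This one simply copies each input string unchanged.
struct copy_fn {
  cudf::column_device_view d_strings;
  int32_t* d_offsets{};
  char* d_chars{};

  __device__ void operator()(cudf::size_type idx)
  {
    if (d_strings.is_null(idx)) {
      if (!d_chars) d_offsets[idx] = 0;
      return;
    }
    auto const d_str = d_strings.element<cudf::string_view>(idx);
    if (!d_chars) {
      d_offsets[idx] = d_str.size_bytes();  // pass 1: sizes
    } else {
      memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());  // pass 2: bytes
    }
  }
};

// New call shape, with no null_count argument:
// auto children = cudf::strings::detail::make_strings_children(
//   copy_fn{*d_strings_view}, strings.size(), stream, mr);
```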
@@ -119,15 +119,14 @@ std::unique_ptr replace_with_backrefs( backrefs_fn{ *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, strings.size(), - strings.null_count(), stream, mr); } else if (regex_insts <= RX_MEDIUM_INSTS) return replace_with_backrefs_medium( - *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr); + *d_strings, *d_prog, d_repl_template, backrefs, stream, mr); else return replace_with_backrefs_large( - *d_strings, *d_prog, d_repl_template, backrefs, strings.null_count(), stream, mr); + *d_strings, *d_prog, d_repl_template, backrefs, stream, mr); }(); return make_strings_column(strings.size(), diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index 529b91a98e5..c0a1456b196 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -118,7 +118,6 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, device_span backrefs, - size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -126,7 +125,6 @@ children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, device_span backrefs, - size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/strings/replace/backref_re_large.cu b/cpp/src/strings/replace/backref_re_large.cu index 56bd8941b8a..4f3c2fb3e1d 100644 --- a/cpp/src/strings/replace/backref_re_large.cu +++ b/cpp/src/strings/replace/backref_re_large.cu @@ -28,7 +28,6 @@ children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, device_span backrefs, - size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -37,7 +36,6 @@ children_pair replace_with_backrefs_large(column_device_view const& d_strings, backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), - null_count, stream, mr); } diff --git a/cpp/src/strings/replace/backref_re_medium.cu b/cpp/src/strings/replace/backref_re_medium.cu index 8b1dd6c5999..277c75930a6 100644 --- a/cpp/src/strings/replace/backref_re_medium.cu +++ b/cpp/src/strings/replace/backref_re_medium.cu @@ -28,7 +28,6 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, device_span backrefs, - size_type null_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -37,7 +36,6 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), - null_count, stream, mr); } diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 39725361741..bbbe7a524d3 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -181,7 +181,6 @@ std::unique_ptr replace_re( replace_multi_regex_fn{ *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, strings_count, - strings.null_count(), stream, mr); else if (regex_insts <= RX_MEDIUM_INSTS) @@ -189,7 +188,6 @@ std::unique_ptr replace_re( replace_multi_regex_fn{ *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, strings_count, - strings.null_count(), 
stream, mr); else @@ -197,7 +195,6 @@ std::unique_ptr replace_re( replace_multi_regex_fn{ *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, strings_count, - strings.null_count(), stream, mr); }(); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index ea474644d06..e19c3efa62b 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -476,12 +476,9 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings offsets_update_fn); // build the characters column - auto chars_column = create_chars_child_column(strings_count, - strings.null_count(), - chars_bytes + (delta_per_target * target_count), - stream, - mr); - auto d_out_chars = chars_column->mutable_view().data(); + auto chars_column = create_chars_child_column( + strings_count, chars_bytes + (delta_per_target * target_count), stream, mr); + auto d_out_chars = chars_column->mutable_view().data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(chars_start), @@ -528,11 +525,7 @@ std::unique_ptr replace_row_parallel(strings_column_view const& strings, // this utility calls the given functor to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, - strings.size(), - strings.null_count(), - stream, - mr); + replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(children.first), @@ -698,12 +691,8 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = - cudf::strings::detail::make_strings_children(replace_slice_fn{*d_strings, d_repl, start, stop}, - strings.size(), - strings.null_count(), - stream, - mr); + auto children = cudf::strings::detail::make_strings_children( + replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(children.first), @@ -790,12 +779,8 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_repls = column_device_view::create(repls.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = - cudf::strings::detail::make_strings_children(replace_multi_fn{*d_strings, *d_targets, *d_repls}, - strings.size(), - strings.null_count(), - stream, - mr); + auto children = cudf::strings::detail::make_strings_children( + replace_multi_fn{*d_strings, *d_targets, *d_repls}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(children.first), @@ -831,10 +816,10 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, auto d_offsets = offsets_column->view().data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream), 
thrust::make_counting_iterator(0), strings_count, diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 156b246fdfc..aef9cb545a6 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -135,21 +135,18 @@ std::unique_ptr replace_re( children = make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, - null_count, stream, mr); else if (regex_insts <= RX_MEDIUM_INSTS) children = make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, - null_count, stream, mr); else children = make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, - null_count, stream, mr); diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 3ffa331ba49..3365897fb4a 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -110,7 +110,7 @@ std::unique_ptr strip( // this utility calls the strip_fn to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - strip_fn{*d_column, stype, d_to_strip}, strings.size(), strings.null_count(), stream, mr); + strip_fn{*d_column, stype, d_to_strip}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(children.first), diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index e8da3120c38..69526a96128 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -120,11 +120,8 @@ std::unique_ptr slice_strings( auto const d_stop = get_scalar_device_view(const_cast&>(stop)); auto const d_step = get_scalar_device_view(const_cast&>(step)); - auto children = make_strings_children(substring_fn{*d_column, d_start, d_stop, d_step}, - strings.size(), - strings.null_count(), - stream, - mr); + auto children = make_strings_children( + substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(children.first), @@ -172,7 +169,7 @@ struct substring_from_fn { } auto const d_str = d_column.template element(idx); auto const length = d_str.length(); - auto const start = starts[idx]; + auto const start = std::max(starts[idx], 0); if (start >= length) { if (!d_chars) d_offsets[idx] = 0; return; @@ -218,8 +215,8 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c : rmm::device_buffer( d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr); - auto children = make_strings_children( - substring_from_fn{d_column, starts, stops}, strings_count, null_count, stream, mr); + auto children = + make_strings_children(substring_from_fn{d_column, starts, stops}, strings_count, stream, mr); return make_strings_column(strings_count, std::move(children.first), diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 138fe3fa508..0dd48ea4d24 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -112,11 +112,8 @@ std::unique_ptr translate( auto d_strings = column_device_view::create(strings.parent(), stream); - auto children = make_strings_children(translate_fn{*d_strings, table.begin(), table.end()}, - strings.size(), - strings.null_count(), - stream, - mr); + auto children = make_strings_children( + translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr); return make_strings_column(strings.size(), std::move(children.first), diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 
2af313627ad..9b3c5606d1c 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -37,21 +37,6 @@ namespace cudf { namespace strings { namespace detail { -// Used to build a temporary string_view object from a single host string. -std::unique_ptr> string_from_host( - const char* str, rmm::cuda_stream_view stream) -{ - if (!str) return nullptr; - auto length = std::strlen(str); - - auto* d_str = new rmm::device_buffer(length, stream); - CUDA_TRY(cudaMemcpyAsync(d_str->data(), str, length, cudaMemcpyHostToDevice, stream.value())); - stream.synchronize(); - - auto deleter = [d_str](string_view* sv) { delete d_str; }; - return std::unique_ptr{ - new string_view(reinterpret_cast(d_str->data()), length), deleter}; -} /** * @copydoc create_string_vector_from_column @@ -117,12 +102,10 @@ std::unique_ptr child_chars_from_string_vector(cudf::device_span create_chars_child_column(cudf::size_type strings_count, - cudf::size_type null_count, cudf::size_type total_bytes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(null_count <= strings_count, "Invalid null count"); return make_numeric_column( data_type{type_id::INT8}, total_bytes, mask_state::UNALLOCATED, stream, mr); } diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index 4f234471e69..a361615f169 100644 --- a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -66,7 +67,6 @@ __device__ inline char* copy_string(char* buffer, const string_view& d_string) * After that, the d_offsets and d_chars are set and this is called again to fill in the * chars memory. * @param strings_count Number of strings. - * @param null_count Number of nulls in the strings column. * @param mr Device memory resource used to allocate the returned columns' device memory. * @param stream CUDA stream used for device memory operations and kernel launches. * @return offsets child column and chars child column for a strings column @@ -75,7 +75,6 @@ template auto make_strings_children( SizeAndExecuteFunction size_and_exec_fn, size_type strings_count, - size_type null_count, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -100,8 +99,9 @@ auto make_strings_children( rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // Now build the chars column - std::unique_ptr chars_column = create_chars_child_column( - strings_count, null_count, thrust::device_pointer_cast(d_offsets)[strings_count], stream, mr); + auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); + std::unique_ptr chars_column = + create_chars_child_column(strings_count, bytes, stream, mr); size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); for_each_fn(size_and_exec_fn); diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index efc9f587a79..2fc35587b20 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -176,7 +176,7 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string cudf::size_type const total_bytes = cudf::detail::get_value(offsets_column->view(), output_count, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(output_count, 0, total_bytes, stream, mr); + cudf::strings::detail::create_chars_child_column(output_count, total_bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 4a41dacbd30..c121fccc960 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -132,7 +132,7 @@ std::unique_ptr generate_ngrams( auto const ngrams_count = strings_count - ngrams + 1; auto children = cudf::strings::detail::make_strings_children( - ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, 0, stream, mr); + ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr); // make the output strings column from the offsets and chars column return cudf::make_strings_column(ngrams_count, @@ -245,7 +245,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), total_ngrams, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(total_ngrams, 0, chars_bytes, stream, mr); + cudf::strings::detail::create_chars_child_column(total_ngrams, chars_bytes, stream, mr); generator.d_chars = chars_column->mutable_view().data(); // output chars thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 96b06e7a1eb..4b080243435 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -221,8 +221,8 @@ std::unique_ptr ngrams_tokenize( auto d_ngram_sizes = ngram_sizes.data(); // ngram to generate // build chars column - auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, 0, output_chars_size, stream, mr); + auto chars_column = + cudf::strings::detail::create_chars_child_column(strings_count, output_chars_size, stream, mr); auto d_chars = chars_column->mutable_view().data(); // Generate the ngrams into the chars column data buffer. 
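As the `utilities.cuh` hunk above shows, `make_strings_children` also loses its `null_count` parameter: callers now pass only the size-and-execute functor, the row count, the stream, and the memory resource. The sketch below is a hedged illustration of a functor compatible with that two-pass protocol; the functor name and its copy logic are illustrative, modeled on the `substring_from_fn` idiom visible earlier in this patch, not code from the patch itself.

```cpp
// Hypothetical size-and-execute functor for the trimmed
// make_strings_children(fn, strings_count, stream, mr) signature.
// Pass 1: d_chars == nullptr, so the functor writes per-row byte counts into
// d_offsets. The utility then scans the offsets, allocates the chars column,
// and calls the functor again with d_chars set for the fill pass.
struct copy_string_fn {
  cudf::column_device_view d_strings;
  int32_t* d_offsets{};
  char* d_chars{};

  __device__ void operator()(cudf::size_type idx)
  {
    if (d_strings.is_null(idx)) {
      if (!d_chars) d_offsets[idx] = 0;
      return;
    }
    auto const d_str = d_strings.element<cudf::string_view>(idx);
    if (!d_chars) {
      d_offsets[idx] = d_str.size_bytes();  // sizing pass
    } else {
      memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());  // fill pass
    }
  }
};

// Usage with the new four-argument signature (null count is now the caller's
// concern when assembling the final strings column):
//   auto children = cudf::strings::detail::make_strings_children(
//     copy_string_fn{*d_strings}, strings.size(), stream, mr);
```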
// The ngram_builder_fn functor also fills the d_ngram_sizes vector with the diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index e3a43ac25c0..d00687c6d90 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -180,7 +180,7 @@ std::unique_ptr normalize_spaces( // build offsets and children using the normalize_space_fn auto children = cudf::strings::detail::make_strings_children( - normalize_spaces_fn{*d_strings}, strings.size(), strings.null_count(), stream, mr); + normalize_spaces_fn{*d_strings}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), std::move(children.first), @@ -225,11 +225,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // build offsets and children using the codepoint_to_utf8_fn auto children = cudf::strings::detail::make_strings_children( - codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, - strings.size(), - strings.null_count(), - stream, - mr); + codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), std::move(children.first), diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index e1a03c3462b..a239c5951cf 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -223,8 +223,7 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( - replacer, strings_count, strings.null_count(), stream, mr); + auto children = cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, @@ -258,8 +257,7 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( - filterer, strings_count, strings.null_count(), stream, mr); + auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, diff --git a/cpp/tests/copying/detail_gather_tests.cu b/cpp/tests/copying/detail_gather_tests.cu index 8457171ac6a..f976a6bcf58 100644 --- a/cpp/tests/copying/detail_gather_tests.cu +++ b/cpp/tests/copying/detail_gather_tests.cu @@ -30,6 +30,8 @@ #include #include +#include + template class GatherTest : public cudf::test::BaseFixture { }; @@ -37,11 +39,11 @@ class GatherTest : public cudf::test::BaseFixture { TYPED_TEST_CASE(GatherTest, cudf::test::NumericTypes); // This test exercises using different iterator types as gather map inputs -// to cudf::detail::gather -- device_vector and raw pointers. +// to cudf::detail::gather -- device_uvector and raw pointers. 
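The gather tests that follow migrate from `rmm::device_vector` to `rmm::device_uvector`, which takes an explicit stream, leaves its storage uninitialized, and exposes a raw pointer directly via `data()`. Before the test body itself, here is a minimal standalone sketch of that pattern under the assumption of a standard RMM/Thrust setup; the helper name `make_gather_map` is illustrative.

```cpp
// Sketch of the device_vector -> device_uvector migration used by these tests.
#include <cudf/types.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/sequence.h>

rmm::device_uvector<cudf::size_type> make_gather_map(cudf::size_type n,
                                                     rmm::cuda_stream_view stream)
{
  // Unlike device_vector, the constructor requires a stream and does not
  // zero-initialize the allocation.
  rmm::device_uvector<cudf::size_type> gather_map(n, stream);
  thrust::sequence(rmm::exec_policy(stream), gather_map.begin(), gather_map.end());
  // data() already yields a raw pointer; there is no .get() as with device_vector.
  return gather_map;  // movable, so returning by value is cheap
}
```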
TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest) { constexpr cudf::size_type source_size{1000}; - rmm::device_vector gather_map(source_size); + rmm::device_uvector gather_map(source_size, rmm::cuda_stream_default); thrust::sequence(thrust::device, gather_map.begin(), gather_map.end()); auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); @@ -63,8 +65,8 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest) // test with raw pointers { - std::unique_ptr result = cudf::detail::gather( - source_table, gather_map.data().get(), gather_map.data().get() + gather_map.size()); + std::unique_ptr result = + cudf::detail::gather(source_table, gather_map.data(), gather_map.data() + gather_map.size()); for (auto i = 0; i < source_table.num_columns(); ++i) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i)); diff --git a/cpp/tests/copying/gather_str_tests.cu b/cpp/tests/copying/gather_str_tests.cu index c597c4ae8c2..98a5a48ea0d 100644 --- a/cpp/tests/copying/gather_str_tests.cu +++ b/cpp/tests/copying/gather_str_tests.cu @@ -26,6 +26,8 @@ #include #include +#include + class GatherTestStr : public cudf::test::BaseFixture { }; @@ -131,7 +133,7 @@ TEST_F(GatherTestStr, GatherEmptyMapStringsColumn) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - rmm::device_vector gather_map{}; + rmm::device_uvector gather_map{0, rmm::cuda_stream_default}; auto results = cudf::detail::gather(cudf::table_view({zero_size_strings_column}), gather_map.begin(), gather_map.end(), diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index d79307dcbf6..ae8808ba59d 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -32,6 +32,8 @@ #include #include +#include + #include std::unique_ptr get_cudf_table() @@ -76,17 +78,17 @@ TEST_F(FromArrowTest, EmptyTable) TEST_F(FromArrowTest, DateTimeTable) { - auto data = {1, 2, 3, 4, 5, 6}; + auto data = std::vector{1, 2, 3, 4, 5, 6}; - auto col = - cudf::test::fixed_width_column_wrapper(data); + auto col = cudf::test::fixed_width_column_wrapper( + data.begin(), data.end()); cudf::table_view expected_table_view({col}); std::shared_ptr arr; - arrow::TimestampBuilder timestamp_builder(timestamp(arrow::TimeUnit::type::MILLI), + arrow::TimestampBuilder timestamp_builder(arrow::timestamp(arrow::TimeUnit::type::MILLI), arrow::default_memory_pool()); - timestamp_builder.AppendValues(std::vector{1, 2, 3, 4, 5, 6}); + timestamp_builder.AppendValues(data); CUDF_EXPECTS(timestamp_builder.Finish(&arr).ok(), "Failed to build array"); std::vector> schema_vector({arrow::field("a", arr->type())}); @@ -337,10 +339,10 @@ TEST_P(FromArrowTestSlice, SliceTest) auto start = std::get<0>(GetParam()); auto end = std::get<1>(GetParam()); - auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; - cudf::table expected_cudf_table{sliced_cudf_table}; - auto sliced_arrow_table = arrow_table->Slice(start, end - start); - auto got_cudf_table = cudf::from_arrow(*sliced_arrow_table); + auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; + auto expected_cudf_table = cudf::table{sliced_cudf_table}; + auto sliced_arrow_table = arrow_table->Slice(start, end - start); + auto got_cudf_table = cudf::from_arrow(*sliced_arrow_table); // This has been added to take-care of empty string column issue with no children if (got_cudf_table->num_rows() == 0 and expected_cudf_table.num_rows() == 
0) { @@ -350,6 +352,131 @@ TEST_P(FromArrowTestSlice, SliceTest) } } +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(FromArrowTest, FixedPointTable) +{ + using namespace numeric; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = fp_wrapper({1, 2, 3, 4, 5, 6}, scale_type{i}); + auto const expected = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + data.size() / BIT_WIDTH_RATIO); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } +} + +TEST_F(FromArrowTest, FixedPointTableLarge) +{ + using namespace numeric; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto every_other = [](auto i) { return i % BIT_WIDTH_RATIO ? 0 : i / BIT_WIDTH_RATIO; }; + auto transform = cudf::detail::make_counting_transform_iterator(BIT_WIDTH_RATIO, every_other); + auto const data = std::vector(transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO); + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{i}); + auto const expected = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), NUM_ELEMENTS); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } +} + +TEST_F(FromArrowTest, FixedPointTableNulls) +{ + using namespace numeric; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); + auto const expected = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + data.size() / BIT_WIDTH_RATIO); + decimal_builder.AppendNull(); + decimal_builder.AppendNull(); + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = 
cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } +} + +TEST_F(FromArrowTest, FixedPointTableNullsLarge) +{ + using namespace numeric; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto every_other = [](auto i) { return i % BIT_WIDTH_RATIO ? 0 : i / BIT_WIDTH_RATIO; }; + auto transform = cudf::detail::make_counting_transform_iterator(BIT_WIDTH_RATIO, every_other); + auto const data = std::vector(transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO); + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, transform, scale_type{i}); + auto const expected = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(10, -i), arrow::default_memory_pool()); + for (int64_t i = 0; i < NUM_ELEMENTS / BIT_WIDTH_RATIO; ++i) { + decimal_builder.Append(reinterpret_cast(data.data() + 4 * i)); + decimal_builder.AppendNull(); + } + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_cudf_table = cudf::from_arrow(*arrow_table); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + } +} + INSTANTIATE_TEST_CASE_P(FromArrowTest, FromArrowTestSlice, ::testing::Values(std::make_tuple(0, 10000), diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 57275433516..00d625175d0 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -353,6 +353,134 @@ TEST_F(ToArrowTest, StructColumn) ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); } +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(ToArrowTest, FixedPointTable) +{ + using namespace numeric; + auto constexpr const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{i}); + auto const input = cudf::table_view({col}); + + auto const expect_data = std::vector{-1, -1, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(expect_data.data()), + expect_data.size() / BIT_WIDTH_RATIO); + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + } +} + +TEST_F(ToArrowTest, FixedPointTableLarge) +{ + using namespace numeric; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{i}); + auto const input = cudf::table_view({col}); + 
+ auto every_other = [](auto i) { return i % 2 == 0 ? i / 2 : 0; }; + auto transform = cudf::detail::make_counting_transform_iterator(2, every_other); + auto const expect_data = + std::vector{transform, transform + NUM_ELEMENTS * BIT_WIDTH_RATIO}; + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); + + // Note: For some reason, decimal_builder.AppendValues with NUM_ELEMENTS >= 1000 doesn't work + for (int i = 0; i < NUM_ELEMENTS; ++i) + decimal_builder.Append(reinterpret_cast(expect_data.data() + 2 * i)); + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + } +} + +TEST_F(ToArrowTest, FixedPointTableNullsSimple) +{ + using namespace numeric; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{i}); + auto const input = cudf::table_view({col}); + + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); + decimal_builder.AppendValues(reinterpret_cast(data.data()), + data.size() / BIT_WIDTH_RATIO); + decimal_builder.AppendNull(); + decimal_builder.AppendNull(); + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true)); + } +} + +TEST_F(ToArrowTest, FixedPointTableNulls) +{ + using namespace numeric; + auto constexpr BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t + + for (auto const i : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, scale_type{i}); + auto const input = cudf::table_view({col}); + + auto const expect_data = + std::vector{1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0}; + std::shared_ptr arr; + arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -i), arrow::default_memory_pool()); + for (int64_t i = 0; i < input.column(0).size() / BIT_WIDTH_RATIO; ++i) { + decimal_builder.Append(reinterpret_cast(expect_data.data() + 4 * i)); + decimal_builder.AppendNull(); + } + + CUDF_EXPECTS(decimal_builder.Finish(&arr).ok(), "Failed to build array"); + + auto const field = arrow::field("a", arr->type()); + auto const schema_vector = std::vector>({field}); + auto const schema = std::make_shared(schema_vector); + auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); + + auto got_arrow_table = cudf::to_arrow(input, {{"a"}}); + + ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true)); + } +} + struct ToArrowTestSlice : public ToArrowTest, public ::testing::WithParamInterface> { diff --git a/cpp/tests/transform/row_bit_count_test.cu 
b/cpp/tests/transform/row_bit_count_test.cu index 21e5c818197..313113a58e0 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include #include @@ -47,7 +45,7 @@ TYPED_TEST(RowBitCountTyped, SimpleTypes) // expect size of the type per row auto expected = make_fixed_width_column(data_type{type_id::INT32}, 16); cudf::mutable_column_view mcv(*expected); - thrust::fill(rmm::exec_policy(0), + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), mcv.begin(), mcv.end(), sizeof(device_storage_type_t) * CHAR_BIT); @@ -70,7 +68,7 @@ TYPED_TEST(RowBitCountTyped, SimpleTypesWithNulls) // expect size of the type + 1 bit per row auto expected = make_fixed_width_column(data_type{type_id::INT32}, 16); cudf::mutable_column_view mcv(*expected); - thrust::fill(rmm::exec_policy(0), + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), mcv.begin(), mcv.end(), (sizeof(device_storage_type_t) * CHAR_BIT) + 1); @@ -490,7 +488,7 @@ TEST_F(RowBitCount, Table) auto expected = cudf::make_fixed_width_column(data_type{type_id::INT32}, t.num_rows()); cudf::mutable_column_view mcv(*expected); thrust::transform( - rmm::exec_policy(0), + rmm::exec_policy(rmm::cuda_stream_default), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + t.num_rows(), mcv.begin(), @@ -586,7 +584,7 @@ TEST_F(RowBitCount, EmptyTable) } { - auto strings = cudf::strings::detail::make_empty_strings_column(0); + auto strings = cudf::make_empty_column(data_type{type_id::STRING}); auto ints = cudf::make_empty_column(data_type{type_id::INT32}); cudf::table_view empty({*strings, *ints}); diff --git a/java/README.md b/java/README.md index 366d014db95..7cc1fcf5aec 100644 --- a/java/README.md +++ b/java/README.md @@ -78,8 +78,7 @@ There is experimental work to try and remove that requirement but it is not full you can build cuDF with `-DCUDA_STATIC_RUNTIME=ON` when running cmake, and similarly `-DCUDA_STATIC_RUNTIME=ON` when running maven. This will statically link in the CUDA runtime and result in a jar with no classifier that should run on any host that has a version of the -driver new enough to support the runtime that this was built with. Unfortunately `libnvrtc` is still -required for runtime code generation which also is tied to a specific version of cuda. +driver new enough to support the runtime that this was built with. To build with maven for dynamic linking you would run. diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7 index c16b5a283ff..607520692d0 100644 --- a/java/ci/Dockerfile.centos7 +++ b/java/ci/Dockerfile.centos7 @@ -17,7 +17,7 @@ ### # Build the image for cudf development environment. 
# -# Arguments: CUDA_VERSION=10.1, 10.2 or 11.0 +# Arguments: CUDA_VERSION=11.0, 11.1, 11.2.0 or 11.2.2 # ### ARG CUDA_VERSION @@ -38,3 +38,10 @@ RUN cd /rapids/ && wget https://dl.bintray.com/boostorg/release/1.72.0/source/bo RUN cd /usr/local/ && wget --quiet https://github.com/Kitware/CMake/releases/download/v3.19.0/cmake-3.19.0-Linux-x86_64.tar.gz && \ tar zxf cmake-3.19.0-Linux-x86_64.tar.gz + +# get GDS user-space lib +RUN cd /tmp/ && wget https://developer.download.nvidia.com/gds/redist/rel-0.95.0/gds-redistrib-0.95.0.tgz && \ + tar zxf gds-redistrib-0.95.0.tgz && \ + cp -R ./gds-redistrib-0.95.0/targets/x86_64-linux/lib/* /usr/local/cuda/targets/x86_64-linux/lib && \ + cp -R ./gds-redistrib-0.95.0/targets/x86_64-linux/include/* /usr/local/cuda/targets/x86_64-linux/include && \ + rm -rf gds-redistrib-0.95.0* diff --git a/java/ci/README.md b/java/ci/README.md index 64ce1f38814..ed971304dfc 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -11,7 +11,7 @@ In the root path of cuDF repo, run below command to build the docker image. ```bash -docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.0 -t cudf-build:11.0-devel-centos7 . +docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.2.2 -t cudf-build:11.2.2-devel-centos7 . ``` The following CUDA versions are supported: @@ -20,7 +20,7 @@ The following CUDA versions are supported: * CUDA 11.2 Change the --build-arg CUDA_VERSION to what you need. -You can replace the tag "cudf-build:11.0-devel-centos7" with another name you like. +You can replace the tag "cudf-build:11.2.2-devel-centos7" with another name you like. ## Start the docker then build @@ -28,7 +28,7 @@ You can replace the tag "cudf-build:11.0-devel-centos7" with another name you li Run below command to start a docker container with GPU. 
```bash -nvidia-docker run -it cudf-build:11.0-devel-centos7 bash +nvidia-docker run -it cudf-build:11.2.2-devel-centos7 bash ``` ### Download the cuDF source code diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index bd38c7ca0b6..17776288b49 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -252,7 +252,14 @@ set(SOURCE_FILES add_library(cudfjni SHARED ${SOURCE_FILES}) #Override RPATH for cudfjni -SET_TARGET_PROPERTIES(cudfjni PROPERTIES BUILD_RPATH "\$ORIGIN") +SET_TARGET_PROPERTIES(cudfjni + PROPERTIES BUILD_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON +) target_compile_options(cudfjni PRIVATE "$<$:${CUDF_CXX_FLAGS}>" @@ -267,7 +274,7 @@ target_compile_definitions(cudfjni if(USE_GDS) add_library(cufilejni SHARED "src/CuFileJni.cpp") target_include_directories(cufilejni PRIVATE "${cuFile_INCLUDE_DIRS}") - target_link_libraries(cufilejni PRIVATE "${cuFile_LIBRARIES}") + target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") endif(USE_GDS) ################################################################################################### @@ -284,4 +291,4 @@ target_compile_definitions(cudfjni PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM ################################################################################################### # - link libraries -------------------------------------------------------------------------------- -target_link_libraries(cudfjni ${CUDF_LIB} ${ARROW_LIBRARY} ${NVCOMP_LIB} ${CUDART_LIBRARY} cuda nvrtc) +target_link_libraries(cudfjni ${CUDF_LIB} ${ARROW_LIBRARY} ${NVCOMP_LIB} ${CUDART_LIBRARY} cuda) diff --git a/java/src/main/native/src/CuFileJni.cpp b/java/src/main/native/src/CuFileJni.cpp index 4e1d3c190f4..e3ef22f0089 100644 --- a/java/src/main/native/src/CuFileJni.cpp +++ b/java/src/main/native/src/CuFileJni.cpp @@ -23,6 +23,7 @@ #include #include +#include "cudf_jni_apis.hpp" #include "jni_utils.hpp" namespace { @@ -281,6 +282,7 @@ extern "C" { */ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFile_createDriver(JNIEnv *env, jclass) { try { + cudf::jni::auto_set_device(env); return reinterpret_cast(new cufile_driver()); } CATCH_STD(env, 0); @@ -295,6 +297,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFile_createDriver(JNIEnv *env, jcl JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_destroyDriver(JNIEnv *env, jclass, jlong pointer) { try { + cudf::jni::auto_set_device(env); if (pointer != 0) { auto *driver = reinterpret_cast(pointer); delete driver; @@ -316,6 +319,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_writeToFile(JNIEnv *env, jclas jlong file_offset, jlong device_pointer, jlong size) { try { + cudf::jni::auto_set_device(env); cufile_buffer buffer{reinterpret_cast(device_pointer), static_cast(size)}; auto writer = cufile_file::make_writer(env->GetStringUTFChars(path, nullptr)); writer->write(buffer, file_offset); @@ -334,6 +338,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_writeToFile(JNIEnv *env, jclas JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFile_appendToFile(JNIEnv *env, jclass, jstring path, jlong device_pointer, jlong size) { try { + cudf::jni::auto_set_device(env); cufile_buffer buffer{reinterpret_cast(device_pointer), static_cast(size)}; auto writer = cufile_file::make_writer(env->GetStringUTFChars(path, nullptr)); return writer->append(buffer); @@ -354,6 +359,7 @@ JNIEXPORT void JNICALL 
Java_ai_rapids_cudf_CuFile_readFromFile(JNIEnv *env, jcla jlong device_pointer, jlong size, jstring path, jlong file_offset) { try { + cudf::jni::auto_set_device(env); cufile_buffer buffer{reinterpret_cast(device_pointer), static_cast(size)}; auto const reader = cufile_file::make_reader(env->GetStringUTFChars(path, nullptr)); reader->read(buffer, file_offset); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index a10ba9a2700..a0938ddb2b5 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -404,7 +404,7 @@ static std::unique_ptr fixed_width_convert_to_rows( input_data->data(), input_nm->data(), data->mutable_view().data()); return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, 0, mr}, stream, mr); + rmm::device_buffer{0, rmm::cuda_stream_default, mr}, stream, mr); } static cudf::data_type get_data_type(const cudf::column_view &v) { diff --git a/python/cudf/cudf/_lib/aggregation.pxd b/python/cudf/cudf/_lib/aggregation.pxd index bb332c44237..972f95d5aab 100644 --- a/python/cudf/cudf/_lib/aggregation.pxd +++ b/python/cudf/cudf/_lib/aggregation.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.aggregation cimport aggregation -cdef unique_ptr[aggregation] make_aggregation(op, kwargs=*) except * - cdef class Aggregation: cdef unique_ptr[aggregation] c_obj + +cdef Aggregation make_aggregation(op, kwargs=*) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7138bb49743..682d8cbf329 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -56,85 +56,55 @@ class AggregationKind(Enum): cdef class Aggregation: - def __init__(self, op, **kwargs): - self.c_obj = move(make_aggregation(op, kwargs)) - + """A Cython wrapper for aggregations. + + **This class should never be instantiated using a standard constructor, + only using one of its many factories.** These factories handle mapping + different cudf operations to their libcudf analogs, e.g. + `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform + any additional configuration needed to translate Python arguments into + their corresponding C++ types (for instance, C++ enumerations used for + flag arguments). The factory approach is necessary to support operations + like `df.agg(lambda x: x.sum())`; such functions are called with this + class as an argument to generation the desired aggregation. + """ @property def kind(self): - return AggregationKind(self.c_obj.get()[0].kind).name.lower() - - -cdef unique_ptr[aggregation] make_aggregation(op, kwargs={}) except *: - """ - Parameters - ---------- - op : str or callable - If callable, must meet one of the following requirements: - - * Is of the form lambda x: x.agg(*args, **kwargs), where - `agg` is the name of a supported aggregation. Used to - to specify aggregations that take arguments, e.g., - `lambda x: x.quantile(0.5)`. - * Is a user defined aggregation function that operates on - group values. In this case, the output dtype must be - specified in the `kwargs` dictionary. 
- - Returns - ------- - unique_ptr[aggregation] - """ - cdef Aggregation agg - if isinstance(op, str): - agg = getattr(_AggregationFactory, op)(**kwargs) - elif callable(op): - if op is list: - agg = _AggregationFactory.collect() - elif "dtype" in kwargs: - agg = _AggregationFactory.from_udf(op, **kwargs) - else: - agg = op(_AggregationFactory) - else: - raise TypeError("Unknown aggregation {}".format(op)) - return move(agg.c_obj) - -# The Cython pattern below enables us to create an Aggregation -# without ever calling its `__init__` method, which would otherwise -# result in a RecursionError. -cdef class _AggregationFactory: + return AggregationKind(self.c_obj.get()[0].kind).name @classmethod def sum(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_sum_aggregation()) return agg @classmethod def min(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_min_aggregation()) return agg @classmethod def max(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_max_aggregation()) return agg @classmethod def idxmin(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_argmin_aggregation()) return agg @classmethod def idxmax(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_argmax_aggregation()) return agg @classmethod def mean(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_mean_aggregation()) return agg @@ -146,7 +116,7 @@ cdef class _AggregationFactory: else: c_null_handling = libcudf_types.null_policy.INCLUDE - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_count_aggregation( c_null_handling )) @@ -154,7 +124,7 @@ cdef class _AggregationFactory: @classmethod def size(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_count_aggregation( ( NullHandling.INCLUDE @@ -164,13 +134,13 @@ cdef class _AggregationFactory: @classmethod def nunique(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_nunique_aggregation()) return agg @classmethod def nth(cls, libcudf_types.size_type size): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move( libcudf_aggregation.make_nth_element_aggregation(size) ) @@ -178,49 +148,49 @@ cdef class _AggregationFactory: @classmethod def any(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_any_aggregation()) return agg @classmethod def all(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_all_aggregation()) return agg @classmethod def product(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_product_aggregation()) return agg @classmethod def sum_of_squares(cls): - cdef Aggregation agg = 
Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_sum_of_squares_aggregation()) return agg @classmethod def var(cls, ddof=1): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_variance_aggregation(ddof)) return agg @classmethod def std(cls, ddof=1): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_std_aggregation(ddof)) return agg @classmethod def median(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_median_aggregation()) return agg @classmethod def quantile(cls, q=0.5, interpolation="linear"): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() if not pd.api.types.is_list_like(q): q = [q] @@ -240,19 +210,19 @@ cdef class _AggregationFactory: @classmethod def collect(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation()) return agg @classmethod def unique(cls): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation()) return agg @classmethod def from_udf(cls, op, *args, **kwargs): - cdef Aggregation agg = Aggregation.__new__(Aggregation) + cdef Aggregation agg = cls() cdef libcudf_types.type_id tid cdef libcudf_types.data_type out_dtype @@ -282,3 +252,42 @@ cdef class _AggregationFactory: libcudf_aggregation.udf_type.PTX, cpp_str, out_dtype )) return agg + + +cdef Aggregation make_aggregation(op, kwargs=None): + r""" + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. + + Returns + ------- + Aggregation + """ + if kwargs is None: + kwargs = {} + + cdef Aggregation agg + if isinstance(op, str): + agg = getattr(Aggregation, op)(**kwargs) + elif callable(op): + if op is list: + agg = Aggregation.collect() + elif "dtype" in kwargs: + agg = Aggregation.from_udf(op, **kwargs) + else: + agg = op(Aggregation) + else: + raise TypeError(f"Unknown aggregation {op}") + return agg diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 4584841dd33..3c2b541f728 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,6 +1,15 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
from collections import defaultdict +from pandas.core.groupby.groupby import DataError +from cudf.utils.dtypes import ( + is_categorical_dtype, + is_string_dtype, + is_list_dtype, + is_interval_dtype, + is_struct_dtype, + is_decimal_dtype, +) import numpy as np import rmm @@ -13,56 +22,23 @@ from libcpp cimport bool from cudf._lib.column cimport Column from cudf._lib.table cimport Table -from cudf._lib.aggregation cimport make_aggregation, Aggregation +from cudf._lib.aggregation cimport Aggregation, make_aggregation from cudf._lib.cpp.table.table cimport table, table_view cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.groupby as libcudf_groupby -cimport cudf._lib.cpp.aggregation as libcudf_aggregation # The sets below define the possible aggregations that can be performed on -# different dtypes. The uppercased versions of these strings correspond to -# elements of the AggregationKind enum. -_CATEGORICAL_AGGS = { - "count", - "size", - "nunique", - "unique", -} - -_STRING_AGGS = { - "count", - "size", - "max", - "min", - "nunique", - "nth", - "collect", - "unique", -} - -_LIST_AGGS = { - "collect", -} - -_STRUCT_AGGS = { -} - -_INTERVAL_AGGS = { -} - -_DECIMAL_AGGS = { - "count", - "sum", - "argmin", - "argmax", - "min", - "max", - "nunique", - "nth", - "collect" -} +# different dtypes. These strings must be elements of the AggregationKind enum. +_CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} +_STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", + "UNIQUE"} +_LIST_AGGS = {"COLLECT"} +_STRUCT_AGGS = set() +_INTERVAL_AGGS = set() +_DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", + "NTH", "COLLECT"} cdef class GroupBy: @@ -132,21 +108,51 @@ cdef class GroupBy: """ from cudf.core.column_accessor import ColumnAccessor cdef vector[libcudf_groupby.aggregation_request] c_agg_requests + cdef libcudf_groupby.aggregation_request c_agg_request cdef Column col + cdef Aggregation agg_obj - aggregations = _drop_unsupported_aggs(values, aggregations) + allow_empty = all(len(v) == 0 for v in aggregations.values()) + included_aggregations = defaultdict(list) for i, (col_name, aggs) in enumerate(aggregations.items()): col = values._data[col_name] - c_agg_requests.push_back( - move(libcudf_groupby.aggregation_request()) + dtype = col.dtype + + valid_aggregations = ( + _LIST_AGGS if is_list_dtype(dtype) + else _STRING_AGGS if is_string_dtype(dtype) + else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) + else _STRING_AGGS if is_struct_dtype(dtype) + else _INTERVAL_AGGS if is_interval_dtype(dtype) + else _DECIMAL_AGGS if is_decimal_dtype(dtype) + else "ALL" ) - c_agg_requests[i].values = col.view() + if (valid_aggregations is _DECIMAL_AGGS + and rmm._cuda.gpu.runtimeGetVersion() < 11000): + raise RuntimeError( + "Decimal aggregations are only supported on CUDA >= 11 " + "due to an nvcc compiler bug." 
+ ) + + c_agg_request = move(libcudf_groupby.aggregation_request()) for agg in aggs: - c_agg_requests[i].aggregations.push_back( - move(make_aggregation(agg)) + agg_obj = make_aggregation(agg) + if (valid_aggregations == "ALL" + or agg_obj.kind in valid_aggregations): + included_aggregations[col_name].append(agg) + c_agg_request.aggregations.push_back( + move(agg_obj.c_obj) + ) + if not c_agg_request.aggregations.empty(): + c_agg_request.values = col.view() + c_agg_requests.push_back( + move(c_agg_request) ) + if c_agg_requests.empty() and not allow_empty: + raise DataError("All requested aggregations are unsupported.") + cdef pair[ unique_ptr[table], vector[libcudf_groupby.aggregation_result] @@ -176,81 +182,14 @@ cdef class GroupBy: ) result_data = ColumnAccessor(multiindex=True) - for i, col_name in enumerate(aggregations): - for j, agg_name in enumerate(aggregations[col_name]): + # Note: This loop relies on the included_aggregations dict being + # insertion ordered to map results to requested aggregations by index. + for i, col_name in enumerate(included_aggregations): + for j, agg_name in enumerate(included_aggregations[col_name]): if callable(agg_name): agg_name = agg_name.__name__ result_data[(col_name, agg_name)] = ( Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - result = Table(data=result_data, index=grouped_keys) - return result - - -def _drop_unsupported_aggs(Table values, aggs): - """ - Drop any aggregations that are not supported. - """ - from pandas.core.groupby.groupby import DataError - - if all(len(v) == 0 for v in aggs.values()): - return aggs - - from cudf.utils.dtypes import ( - is_categorical_dtype, - is_string_dtype, - is_list_dtype, - is_interval_dtype, - is_struct_dtype, - is_decimal_dtype, - ) - result = aggs.copy() - - for col_name in aggs: - if ( - is_list_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _LIST_AGGS: - del result[col_name][i] - elif ( - is_string_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _STRING_AGGS: - del result[col_name][i] - elif ( - is_categorical_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS: - del result[col_name][i] - elif ( - is_struct_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _STRUCT_AGGS: - del result[col_name][i] - elif ( - is_interval_dtype(values._data[col_name].dtype) - ): - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _INTERVAL_AGGS: - del result[col_name][i] - elif ( - is_decimal_dtype(values._data[col_name].dtype) - ): - if rmm._cuda.gpu.runtimeGetVersion() < 11000: - raise RuntimeError( - "Decimal aggregations are only supported on CUDA >= 11 " - "due to an nvcc compiler bug." 
- ) - for i, agg_name in enumerate(aggs[col_name]): - if Aggregation(agg_name).kind not in _DECIMAL_AGGS: - del result[col_name][i] - - if all(len(v) == 0 for v in result.values()): - raise DataError("No numeric types to aggregate") - - return result + return Table(data=result_data, index=grouped_keys) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 62013ea88ae..e5723331f3c 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -12,7 +12,7 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types from cudf._lib.types cimport underlying_type_t_type_id, dtype_to_data_type -from cudf._lib.aggregation cimport make_aggregation, aggregation +from cudf._lib.aggregation cimport make_aggregation, Aggregation from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair import numpy as np @@ -45,9 +45,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): cdef column_view c_incol_view = incol.view() cdef unique_ptr[scalar] c_result - cdef unique_ptr[aggregation] c_agg = move(make_aggregation( - reduction_op, kwargs - )) + cdef Aggregation cython_agg = make_aggregation(reduction_op, kwargs) cdef data_type c_out_dtype = dtype_to_data_type(col_dtype) @@ -65,7 +63,7 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): with nogil: c_result = move(cpp_reduce( c_incol_view, - c_agg, + cython_agg.c_obj, c_out_dtype )) @@ -95,9 +93,7 @@ def scan(scan_op, Column incol, inclusive, **kwargs): """ cdef column_view c_incol_view = incol.view() cdef unique_ptr[column] c_result - cdef unique_ptr[aggregation] c_agg = move( - make_aggregation(scan_op, kwargs) - ) + cdef Aggregation cython_agg = make_aggregation(scan_op, kwargs) cdef scan_type c_inclusive if inclusive is True: @@ -108,7 +104,7 @@ def scan(scan_op, Column incol, inclusive, **kwargs): with nogil: c_result = move(cpp_scan( c_incol_view, - c_agg, + cython_agg.c_obj, c_inclusive )) diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx index 9c818f39c38..d67fb431ec4 100644 --- a/python/cudf/cudf/_lib/rolling.pyx +++ b/python/cudf/cudf/_lib/rolling.pyx @@ -8,12 +8,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.column cimport Column -from cudf._lib.aggregation cimport make_aggregation +from cudf._lib.aggregation cimport Aggregation, make_aggregation from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.aggregation cimport aggregation from cudf._lib.cpp.rolling cimport ( rolling_window as cpp_rolling_window ) @@ -47,14 +46,12 @@ def rolling(Column source_column, Column pre_column_window, cdef column_view source_column_view = source_column.view() cdef column_view pre_column_window_view cdef column_view fwd_column_window_view - cdef unique_ptr[aggregation] agg + cdef Aggregation cython_agg if callable(op): - agg = move( - make_aggregation(op, {'dtype': source_column.dtype}) - ) + cython_agg = make_aggregation(op, {'dtype': source_column.dtype}) else: - agg = move(make_aggregation(op)) + cython_agg = make_aggregation(op) if window is None: if center: @@ -71,7 +68,7 @@ def rolling(Column source_column, Column pre_column_window, pre_column_window_view, fwd_column_window_view, c_min_periods, - agg) + cython_agg.c_obj) ) else: c_min_periods = min_periods @@ -89,7 +86,7 @@ def rolling(Column source_column, 
Column pre_column_window, c_window, c_forward_window, c_min_periods, - agg) + cython_agg.c_obj) ) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 807e96f2c38..24b25b6eec0 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -7,4 +7,4 @@ PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0") PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1") PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") -PANDAS_EQ_123 = PANDAS_VERSION == version.parse("1.2.3") +PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2") diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index e0aa9471a2f..32cb557548f 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -7,7 +7,6 @@ as_column, build_categorical_column, build_column, - column_applymap, column_empty, column_empty_like, column_empty_like_same_mask, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6a1600d6461..ee196e6659f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,7 @@ import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda, njit +from numba import cuda import cudf from cudf import _lib as libcudf @@ -41,8 +41,7 @@ from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.abc import Serializable from cudf.core.buffer import Buffer -from cudf.core.dtypes import CategoricalDtype -from cudf.core.dtypes import IntervalDtype +from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( NUMERIC_TYPES, @@ -358,7 +357,7 @@ def to_arrow(self) -> pa.Array: pa.null(), len(self), [pa.py_buffer((b""))] ) - return libcudf.interop.to_arrow( + result = libcudf.interop.to_arrow( libcudf.table.Table( cudf.core.column_accessor.ColumnAccessor({"None": self}) ), @@ -366,6 +365,14 @@ def to_arrow(self) -> pa.Array: keep_index=False, )["None"].chunk(0) + if isinstance(self.dtype, cudf.Decimal64Dtype): + result = result.view( + pa.decimal128( + scale=result.type.scale, precision=self.dtype.precision + ) + ) + return result + @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: """ @@ -430,10 +437,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: elif isinstance(array.type, pa.Decimal128Type): return cudf.core.column.DecimalColumn.from_arrow(array) - return libcudf.interop.from_arrow(data, data.column_names)._data[ + result = libcudf.interop.from_arrow(data, data.column_names)._data[ "None" ] + if isinstance(result.dtype, cudf.Decimal64Dtype): + result.dtype.precision = array.type.precision + return result + def _get_mask_as_column(self) -> ColumnBase: return libcudf.transform.mask_to_bools( self.base_mask, self.offset, self.offset + len(self) @@ -445,6 +456,16 @@ def _memory_usage(self, **kwargs) -> int: def default_na_value(self) -> Any: raise NotImplementedError() + def applymap( + self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None + ) -> ColumnBase: + """Apply an element-wise function to the values in the Column.""" + # Subclasses that support applymap must override this behavior. + raise TypeError( + "User-defined functions are currently not supported on data " + f"with dtype {self.dtype}." 
+ ) + def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": """Get a dense numba device array for the data. @@ -1874,7 +1895,9 @@ def as_column( col = col.set_mask(mask) elif np.issubdtype(col.dtype, np.datetime64): if nan_as_null or (mask is None and nan_as_null is None): - col = utils.time_col_replace_nulls(col) + # Ignore typing error since this method is only defined for + # DatetimeColumn, not the ColumnBase class. + col = col._make_copy_with_na_as_null() # type: ignore return col elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): @@ -1995,7 +2018,7 @@ def as_column( data = as_column( buffer, dtype=arbitrary.dtype, nan_as_null=nan_as_null ) - data = utils.time_col_replace_nulls(data) + data = data._make_copy_with_na_as_null() mask = data.mask data = cudf.core.column.datetime.DatetimeColumn( @@ -2015,7 +2038,7 @@ def as_column( data = as_column( buffer, dtype=arbitrary.dtype, nan_as_null=nan_as_null ) - data = utils.time_col_replace_nulls(data) + data = data._make_copy_with_na_as_null() mask = data.mask data = cudf.core.column.timedelta.TimeDeltaColumn( @@ -2196,58 +2219,6 @@ def _construct_array( return arbitrary -def column_applymap( - udf: Callable[[ScalarLike], ScalarLike], - column: ColumnBase, - out_dtype: Dtype, -) -> ColumnBase: - """Apply an element-wise function to transform the values in the Column. - - Parameters - ---------- - udf : function - Wrapped by numba jit for call on the GPU as a device function. - column : Column - The source column. - out_dtype : numpy.dtype - The dtype for use in the output. - - Returns - ------- - result : Column - """ - core = njit(udf) - results = column_empty(len(column), dtype=out_dtype) - values = column.data_array_view - if column.nullable: - # For masked columns - @cuda.jit - def kernel_masked(values, masks, results): - i = cuda.grid(1) - # in range? - if i < values.size: - # valid? - if utils.mask_get(masks, i): - # call udf - results[i] = core(values[i]) - - masks = column.mask_array_view - kernel_masked.forall(len(column))(values, masks, results) - else: - # For non-masked columns - @cuda.jit - def kernel_non_masked(values, results): - i = cuda.grid(1) - # in range? 
- if i < values.size: - # call udf - results[i] = core(values[i]) - - kernel_non_masked.forall(len(column))(values, results) - - return as_column(results) - - def _data_from_cuda_array_interface_desc(obj) -> Buffer: desc = obj.__cuda_array_interface__ ptr = desc["data"][0] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2a0b813ceb0..66141fec610 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -16,7 +16,13 @@ from cudf._typing import DatetimeLikeScalar, Dtype, DtypeObj, ScalarLike from cudf.core._compat import PANDAS_GE_120 from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, column, string +from cudf.core.column import ( + ColumnBase, + as_column, + column, + column_empty_like, + string, +) from cudf.utils.dtypes import is_scalar from cudf.utils.utils import _fillna_natwise @@ -306,7 +312,7 @@ def fillna( self, fill_value: Any = None, method: str = None, dtype: Dtype = None ) -> DatetimeColumn: if fill_value is not None: - if cudf.utils.utils.isnat(fill_value): + if cudf.utils.utils._isnat(fill_value): return _fillna_natwise(self) if is_scalar(fill_value): if not isinstance(fill_value, cudf.Scalar): @@ -372,6 +378,23 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: else: return False + def _make_copy_with_na_as_null(self): + """Return a copy with NaN values replaced with nulls.""" + null = column_empty_like(self, masked=True, newsize=1) + out_col = cudf._lib.replace.replace( + self, + as_column( + Buffer( + np.array([self.default_na_value()], dtype=self.dtype).view( + "|u1" + ) + ), + dtype=self.dtype, + ), + null, + ) + return out_col + @annotate("BINARY_OP", color="orange", domain="cudf_python") def binop( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 10a9ffbfbae..70b4569b180 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from numba import cuda, njit from nvtx import annotate from pandas.api.types import is_integer_dtype @@ -20,6 +21,7 @@ as_column, build_column, column, + column_empty, string, ) from cudf.core.dtypes import Decimal64Dtype @@ -422,8 +424,22 @@ def applymap( """ if out_dtype is None: out_dtype = self.dtype - out = column.column_applymap(udf=udf, column=self, out_dtype=out_dtype) - return out + + core = njit(udf) + + # For non-masked columns + @cuda.jit + def kernel_applymap(values, results): + i = cuda.grid(1) + # in range? 
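+            # `forall` launches a whole number of thread blocks, so a few
+            # trailing threads may run past the column length; they simply
+            # do nothing.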
+ if i < values.size: + # call udf + results[i] = core(values[i]) + + results = column_empty(self.size, dtype=out_dtype) + values = self.data_array_view + kernel_applymap.forall(self.size)(values, results) + return as_column(results) def default_na_value(self) -> ScalarLike: """Returns the default NA value for this column diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a39638106bb..d8ad11f41b3 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -306,7 +306,7 @@ def fillna( self, fill_value: Any = None, method: str = None, dtype: Dtype = None ) -> TimeDeltaColumn: if fill_value is not None: - if cudf.utils.utils.isnat(fill_value): + if cudf.utils.utils._isnat(fill_value): return _fillna_natwise(self) col = self # type: column.ColumnBase if is_scalar(fill_value): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d5393a724ec..beeccdce0a4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -52,6 +52,7 @@ is_struct_dtype, numeric_normalize_types, ) +from cudf.utils.utils import GetAttrGetItemMixin T = TypeVar("T", bound="DataFrame") @@ -109,9 +110,9 @@ def _reverse_op(fn): } -class DataFrame(Frame, Serializable): +class DataFrame(Frame, Serializable, GetAttrGetItemMixin): - _internal_names = {"_data", "_index"} + _PROTECTED_KEYS = frozenset(("_data", "_index")) @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") def __init__(self, data=None, index=None, columns=None, dtype=None): @@ -638,34 +639,26 @@ def __dir__(self): return list(o) def __setattr__(self, key, col): - - # if an attribute already exists, set it. try: + # Preexisting attributes may be set. We cannot rely on checking the + # `_PROTECTED_KEYS` because we must also allow for settable + # properties, and we must call object.__getattribute__ to bypass + # the `__getitem__` behavior inherited from `GetAttrGetItemMixin`. object.__getattribute__(self, key) - object.__setattr__(self, key, col) - return + super().__setattr__(key, col) except AttributeError: - pass - - # if a column already exists, set it. - if key not in self._internal_names: - try: - self[key] # __getitem__ to verify key exists - self[key] = col - return - except KeyError: - pass - - object.__setattr__(self, key, col) - - def __getattr__(self, key): - if key in self._internal_names: - return object.__getattribute__(self, key) - else: - if key in self: - return self[key] - - raise AttributeError("'DataFrame' object has no attribute %r" % key) + if key not in self._PROTECTED_KEYS: + try: + # Check key existence. + self[key] + # If a column already exists, set it. + self[key] = col + return + except KeyError: + pass + + # Set a new attribute that is not already a column. 
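+            # Keys in `_PROTECTED_KEYS` also reach this point without the
+            # column lookup above, so assigning `_data` or `_index` during
+            # construction cannot recurse into `__getitem__`.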
+ super().__setattr__(key, col) @annotate("DATAFRAME_GETITEM", color="blue", domain="cudf_python") def __getitem__(self, arg): @@ -7930,7 +7923,12 @@ def _align_indices(lhs, rhs): return lhs_out, rhs_out -def _setitem_with_dataframe(input_df, replace_df, input_cols=None, mask=None): +def _setitem_with_dataframe( + input_df: DataFrame, + replace_df: DataFrame, + input_cols: Any = None, + mask: Optional[cudf.core.column.ColumnBase] = None, +): """ This function sets item dataframes relevant columns with replacement df :param input_df: Dataframe to be modified inplace @@ -7947,6 +7945,9 @@ def _setitem_with_dataframe(input_df, replace_df, input_cols=None, mask=None): "Number of Input Columns must be same replacement Dataframe" ) + if not input_df.index.equals(replace_df.index): + replace_df = replace_df.reindex(input_df.index) + for col_1, col_2 in zip(input_cols, replace_df.columns): if col_1 in input_df.columns: if mask is not None: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index cc94548d9a2..8d32c99b5b0 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,6 +1,5 @@ # Copyright (c) 2020, NVIDIA CORPORATION. import collections -import functools import pickle import warnings @@ -10,7 +9,7 @@ import cudf from cudf._lib import groupby as libgroupby from cudf.core.abc import Serializable -from cudf.utils.utils import cached_property +from cudf.utils.utils import GetAttrGetItemMixin, cached_property # Note that all valid aggregation methods (e.g. GroupBy.min) are bound to the @@ -570,50 +569,111 @@ def rolling(self, *args, **kwargs): """ return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) + def count(self, dropna=True): + """Compute the number of values in each column. -# Set of valid groupby aggregations that are monkey-patched into the GroupBy -# namespace. -_VALID_GROUPBY_AGGS = { - "count", - "sum", - "idxmin", - "idxmax", - "min", - "max", - "mean", - "var", - "std", - "quantile", - "median", - "nunique", - "collect", - "unique", -} - - -# Dynamically bind the different aggregation methods. -def _agg_func_name_with_args(self, func_name, *args, **kwargs): - """ - Aggregate given an aggregate function name and arguments to the - function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named - aggregations must be members of _AggregationFactory. - """ + Parameters + ---------- + dropna : bool + If ``True``, don't include null values in the count. 
+ """ + + def func(x): + return getattr(x, "count")(dropna=dropna) + + return self.agg(func) + + def sum(self): + """Compute the column-wise sum of the values in each group.""" + return self.agg("sum") + + def idxmin(self): + """Get the column-wise index of the minimum value in each group.""" + return self.agg("idxmin") + + def idxmax(self): + """Get the column-wise index of the maximum value in each group.""" + return self.agg("idxmax") - def func(x): - """Compute the {} of the group.""".format(func_name) - return getattr(x, func_name)(*args, **kwargs) + def min(self): + """Get the column-wise minimum value in each group.""" + return self.agg("min") - func.__name__ = func_name - return self.agg(func) + def max(self): + """Get the column-wise maximum value in each group.""" + return self.agg("max") + def mean(self): + """Compute the column-wise mean of the values in each group.""" + return self.agg("mean") -for key in _VALID_GROUPBY_AGGS: - setattr( - GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key) - ) + def median(self): + """Get the column-wise median of the values in each group.""" + return self.agg("median") + def var(self, ddof=1): + """Compute the column-wise variance of the values in each group. + + Parameters + ---------- + ddof : int + The delta degrees of freedom. N - ddof is the divisor used to + normalize the variance. + """ + + def func(x): + return getattr(x, "var")(ddof=ddof) + + return self.agg(func) + + def std(self, ddof=1): + """Compute the column-wise std of the values in each group. + + Parameters + ---------- + ddof : int + The delta degrees of freedom. N - ddof is the divisor used to + normalize the standard deviation. + """ + + def func(x): + return getattr(x, "std")(ddof=ddof) + + return self.agg(func) + + def quantile(self, q=0.5, interpolation="linear"): + """Compute the column-wise quantiles of the values in each group. + + Parameters + ---------- + q : float or array-like + The quantiles to compute. + interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} + The interpolation method to use when the desired quantile lies + between two data points. Defaults to "linear". + """ + + def func(x): + return getattr(x, "quantile")(q=q, interpolation=interpolation) + + return self.agg(func) + + def nunique(self): + """Compute the number of unique values in each column in each group.""" + return self.agg("nunique") + + def collect(self): + """Get a list of all the values for each column in each group.""" + return self.agg("collect") + + def unique(self): + """Get a list of the unique values for each column in each group.""" + return self.agg("unique") + + +class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + _PROTECTED_KEYS = frozenset(("obj",)) -class DataFrameGroupBy(GroupBy): def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True ): @@ -708,17 +768,6 @@ def __init__( dropna=dropna, ) - def __getattr__(self, key): - # Without this check, copying can trigger a RecursionError. See - # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 - # for an explanation. 
- if key == "obj": - raise AttributeError - try: - return self[key] - except KeyError: - raise AttributeError - def __getitem__(self, key): return self.obj[key].groupby( self.grouping, dropna=self._dropna, sort=self._sort diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f65afb6a1d4..0ffe0c11fef 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -30,7 +30,7 @@ from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import Frame -from cudf.utils import ioutils, utils +from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( find_common_type, @@ -1734,8 +1734,9 @@ def __len__(self): return len(range(self._start, self._stop, self._step)) def __getitem__(self, index): + len_self = len(self) if isinstance(index, slice): - sl_start, sl_stop, sl_step = index.indices(len(self)) + sl_start, sl_stop, sl_step = index.indices(len_self) lo = self._start + sl_start * self._step hi = self._start + sl_stop * self._step @@ -1743,7 +1744,11 @@ def __getitem__(self, index): return RangeIndex(start=lo, stop=hi, step=st, name=self._name) elif isinstance(index, Number): - index = utils.normalize_index(index, len(self)) + if index < 0: + index = len_self + index + if not (0 <= index < len_self): + raise IndexError("out-of-bound") + index = min(index, len_self) index = self._start + index * self._step return index else: diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index d879b2ec4e2..fbd4cf492de 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -2,6 +2,7 @@ import decimal import numpy as np +import pyarrow as pa from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar from cudf.core.column.column import ColumnBase @@ -114,44 +115,36 @@ def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - if isinstance(dtype, Decimal64Dtype): - # TODO: Support coercion from decimal.Decimal to different dtype - # TODO: Support coercion from integer to Decimal64Dtype - raise NotImplementedError( - "dtype as cudf.Decimal64Dtype is not supported. Pass a " - "decimal.Decimal to construct a DecimalScalar." 
- ) - if isinstance(value, decimal.Decimal) and dtype is not None: - raise TypeError(f"Can not coerce decimal to {dtype}") - - value = to_cudf_compatible_scalar(value, dtype=dtype) valid = not _is_null_host_scalar(value) - if isinstance(value, decimal.Decimal): - # 0.0042 -> Decimal64Dtype(2, 4) + if isinstance(dtype, Decimal64Dtype): + value = pa.scalar( + value, type=pa.decimal128(dtype.precision, dtype.scale) + ).as_py() + if isinstance(value, decimal.Decimal) and dtype is None: dtype = Decimal64Dtype._from_decimal(value) - else: - if dtype is None: - if not valid: - if isinstance(value, (np.datetime64, np.timedelta64)): - unit, _ = np.datetime_data(value) - if unit == "generic": - raise TypeError( - "Cant convert generic NaT to null scalar" - ) - else: - dtype = value.dtype - else: + value = to_cudf_compatible_scalar(value, dtype=dtype) + + if dtype is None: + if not valid: + if isinstance(value, (np.datetime64, np.timedelta64)): + unit, _ = np.datetime_data(value) + if unit == "generic": raise TypeError( - "dtype required when constructing a null scalar" + "Cant convert generic NaT to null scalar" ) + else: + dtype = value.dtype else: - dtype = value.dtype - dtype = np.dtype(dtype) + raise TypeError( + "dtype required when constructing a null scalar" + ) + else: + dtype = value.dtype - # temporary - dtype = np.dtype("object") if dtype.char == "U" else dtype + if not isinstance(dtype, Decimal64Dtype): + dtype = np.dtype(dtype) if not valid: value = NA diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 55fd510f03a..4cc5fb56a4c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -43,7 +43,7 @@ from cudf.core.index import Index, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer from cudf.core.window import Rolling -from cudf.utils import cudautils, docutils, ioutils, utils +from cudf.utils import cudautils, docutils, ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, @@ -52,7 +52,6 @@ is_list_like, is_mixed_with_object_dtype, is_scalar, - is_string_dtype, min_scalar_type, numeric_normalize_types, ) @@ -1485,7 +1484,8 @@ def __repr__(self): lines[-1] = lines[-1] + "dtype: %s" % self.dtype else: lines = output.split(",") - return lines[0] + ", dtype: %s)" % self.dtype + lines[-1] = " dtype: %s)" % self.dtype + return ",".join(lines) if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): lines.append(category_memory) return "\n".join(lines) @@ -1504,7 +1504,6 @@ def _binaryop( if isinstance(other, cudf.DataFrame): return NotImplemented - result_name = utils.get_result_name(self, other) if isinstance(other, Series): if not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS: if not self.index.equals(other.index): @@ -1543,8 +1542,19 @@ def _binaryop( rhs = rhs.fillna(fill_value) outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect) - result = lhs._copy_construct(data=outcol, name=result_name) - return result + + # Get the appropriate name for output operations involving two objects + # that are a mix of pandas and cudf Series and Index. If the two inputs + # are identically named, the output shares this name. 
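+        # For example, adding two Series both named "x" yields a result
+        # named "x", while adding differently named Series yields an
+        # unnamed (None) result.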
+ if isinstance(other, (cudf.Series, cudf.Index, pd.Series, pd.Index)): + if self.name == other.name: + result_name = self.name + else: + result_name = None + else: + result_name = self.name + + return lhs._copy_construct(data=outcol, name=result_name) def add(self, other, fill_value=None, axis=0): """ @@ -4364,14 +4374,6 @@ def applymap(self, udf, out_dtype=None): 4 105 dtype: int64 """ - if is_string_dtype(self._column.dtype) or isinstance( - self._column, cudf.core.column.CategoricalColumn - ): - raise TypeError( - "User defined functions are currently not " - "supported on Series with dtypes `str` and `category`." - ) - if callable(udf): res_col = self._unaryop(udf) else: diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 7d1ab3a5435..ca0bd46d9e6 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -10,9 +10,10 @@ from cudf.core import column from cudf.core.column.column import as_column from cudf.utils import cudautils +from cudf.utils.utils import GetAttrGetItemMixin -class Rolling: +class Rolling(GetAttrGetItemMixin): """ Rolling window calculations. @@ -154,6 +155,8 @@ class Rolling: dtype: float64 """ + _PROTECTED_KEYS = frozenset(("obj",)) + _time_window = False def __init__( @@ -181,15 +184,6 @@ def __init__( ) self.win_type = win_type - def __getattr__(self, key): - if key == "obj": - raise AttributeError() - return self.obj[key].rolling( - window=self.window, - min_periods=self.min_periods, - center=self.center, - ) - def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 70fc63baba8..f6502c4c1fd 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -201,3 +201,89 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): expected = cudf.Series(NumericalColumn.from_arrow(pa_arr)) assert_eq(got, expected) + assert_eq(got.dtype, expected.dtype) + + +def _decimal_series(input, dtype): + return cudf.Series( + [x if x is None else Decimal(x) for x in input], dtype=dtype, + ) + + +@pytest.mark.parametrize( + "args", + [ + # scatter to a single index + ( + ["1", "2", "3"], + Decimal64Dtype(1, 0), + Decimal(5), + 1, + ["1", "5", "3"], + ), + ( + ["1.5", "2.5", "3.5"], + Decimal64Dtype(2, 1), + Decimal("5.5"), + 1, + ["1.5", "5.5", "3.5"], + ), + ( + ["1.0042", "2.0042", "3.0042"], + Decimal64Dtype(5, 4), + Decimal("5.0042"), + 1, + ["1.0042", "5.0042", "3.0042"], + ), + # scatter via boolmask + ( + ["1", "2", "3"], + Decimal64Dtype(1, 0), + Decimal(5), + cudf.Series([True, False, True]), + ["5", "2", "5"], + ), + ( + ["1.5", "2.5", "3.5"], + Decimal64Dtype(2, 1), + Decimal("5.5"), + cudf.Series([True, True, True]), + ["5.5", "5.5", "5.5"], + ), + ( + ["1.0042", "2.0042", "3.0042"], + Decimal64Dtype(5, 4), + Decimal("5.0042"), + cudf.Series([False, False, True]), + ["1.0042", "2.0042", "5.0042"], + ), + # We will allow assigning a decimal with less precision + ( + ["1.00", "2.00", "3.00"], + Decimal64Dtype(3, 2), + Decimal(5), + 1, + ["1.00", "5.00", "3.00"], + ), + # But not truncation + ( + ["1", "2", "3"], + Decimal64Dtype(1, 0), + Decimal("5.5"), + 1, + pa.lib.ArrowInvalid, + ), + ], +) +def test_series_setitem_decimal(args): + data, dtype, item, to, expect = args + data = _decimal_series(data, dtype) + + if expect is pa.lib.ArrowInvalid: + with pytest.raises(expect): + data[to] = item + return + else: + expect = 
_decimal_series(expect, dtype) + data[to] = item + assert_eq(data, expect) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 4dbe608af82..868387b100e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1236,7 +1236,11 @@ def test_raise_data_error(): pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) gdf = cudf.from_pandas(pdf) - assert_exceptions_equal(pdf.groupby("a").mean, gdf.groupby("a").mean) + assert_exceptions_equal( + pdf.groupby("a").mean, + gdf.groupby("a").mean, + compare_error_message=False, + ) def test_drop_unsupported_multi_agg(): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 7c274734980..d7b9f2fe1d7 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1493,3 +1493,10 @@ def test_categorical_index_with_nan_repr(): ) assert cat_index[2:].__repr__() == sliced_expected_repr + + +def test_empty_series_name(): + ps = pd.Series([], name="abc", dtype="int") + gs = cudf.from_pandas(ps) + + assert ps.__repr__() == gs.__repr__() diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 916e73ea381..42939f8129a 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -194,14 +195,13 @@ def test_scalar_roundtrip(value): + TEST_DECIMAL_TYPES, ) def test_null_scalar(dtype): - if isinstance(dtype, cudf.Decimal64Dtype): - with pytest.raises(NotImplementedError): - s = cudf.Scalar(None, dtype=dtype) - return - s = cudf.Scalar(None, dtype=dtype) assert s.value is cudf.NA - assert s.dtype == np.dtype(dtype) + assert s.dtype == ( + np.dtype(dtype) + if not isinstance(dtype, cudf.Decimal64Dtype) + else dtype + ) assert s.is_valid() is False @@ -234,25 +234,36 @@ def test_generic_null_scalar_construction_fails(value): @pytest.mark.parametrize( - "dtype", - NUMERIC_TYPES - + DATETIME_TYPES - + TIMEDELTA_TYPES - + ["object"] - + TEST_DECIMAL_TYPES, + "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] ) def test_scalar_dtype_and_validity(dtype): - if isinstance(dtype, cudf.Decimal64Dtype): - with pytest.raises(NotImplementedError): - s = cudf.Scalar(None, dtype=dtype) - return - s = cudf.Scalar(1, dtype=dtype) assert s.dtype == np.dtype(dtype) assert s.is_valid() is True +@pytest.mark.parametrize( + "slr,dtype,expect", + [ + (1, cudf.Decimal64Dtype(1, 0), Decimal("1")), + (Decimal(1), cudf.Decimal64Dtype(1, 0), Decimal("1")), + (Decimal("1.1"), cudf.Decimal64Dtype(2, 1), Decimal("1.1")), + (Decimal("1.1"), cudf.Decimal64Dtype(4, 3), Decimal("1.100")), + (Decimal("1.11"), cudf.Decimal64Dtype(2, 2), pa.lib.ArrowInvalid), + ], +) +def test_scalar_dtype_and_validity_decimal(slr, dtype, expect): + if expect is pa.lib.ArrowInvalid: + with pytest.raises(expect): + cudf.Scalar(slr, dtype=dtype) + return + else: + result = cudf.Scalar(slr, dtype=dtype) + assert result.dtype == dtype + assert result.is_valid + + @pytest.mark.parametrize( "value", [ diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 1005efec3ee..28cb2568908 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_EQ_123, PANDAS_GE_120 +from cudf.core._compat import PANDAS_GE_120, 
PANDAS_LE_122 from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -21,7 +21,7 @@ def test_dataframe_setitem_bool_mask_scaler(df, arg, value): @pytest.mark.xfail( - condition=PANDAS_EQ_123 or not PANDAS_GE_120, + condition=PANDAS_GE_120 and PANDAS_LE_122, reason="https://github.com/pandas-dev/pandas/issues/40204", ) def test_dataframe_setitem_scaler_bool(): diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 722e0b12183..262fe304dd8 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -2,46 +2,12 @@ from pickle import dumps import cachetools -import cupy import numpy as np from numba import cuda import cudf -from cudf.utils.utils import check_equals_float, check_equals_int -try: - # Numba >= 0.49 - from numba.np import numpy_support -except ImportError: - # Numba <= 0.49 - from numba import numpy_support - - -# GPU array type casting - - -def as_contiguous(arr): - assert arr.ndim == 1 - cupy_dtype = arr.dtype - if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = np.dtype("int64") - arr = arr.view("int64") - out = cupy.ascontiguousarray(cupy.asarray(arr)) - return cuda.as_cuda_array(out).view(arr.dtype) - - -# Mask utils - - -def full(size, value, dtype): - cupy_dtype = dtype - if np.issubdtype(cupy_dtype, np.datetime64): - time_unit, _ = np.datetime_data(cupy_dtype) - cupy_dtype = np.int64 - value = np.datetime64(value, time_unit).view(cupy_dtype) - - out = cupy.full(size, value, cupy_dtype) - return cuda.as_cuda_array(out).view(dtype) +from numba.np import numpy_support # @@ -77,7 +43,7 @@ def gpu_diff(in_col, out_col, out_mask, N): def gpu_mark_found_int(arr, val, out, not_found): i = cuda.grid(1) if i < arr.size: - if check_equals_int(arr[i], val): + if arr[i] == val: out[i] = i else: out[i] = not_found @@ -92,7 +58,10 @@ def gpu_mark_found_float(arr, val, out, not_found): # at 0.51.1, this will have a very slight # performance improvement. Related # discussion in : https://github.com/rapidsai/cudf/pull/6073 - if check_equals_float(arr[i], float(val)): + val = float(val) + + # NaN-aware equality comparison. + if (arr[i] == val) or (arr[i] != arr[i] and val != val): out[i] = i else: out[i] = not_found diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c69ccb0f42e..518bd374905 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,12 +2,11 @@ import functools from collections.abc import Sequence -from math import floor, isinf, isnan +from typing import FrozenSet, Set, Union import cupy as cp import numpy as np import pandas as pd -from numba import njit import rmm @@ -16,9 +15,11 @@ from cudf.core.buffer import Buffer from cudf.utils.dtypes import to_cudf_compatible_scalar +# The size of the mask in bytes mask_dtype = np.dtype(np.int32) mask_bitsize = mask_dtype.itemsize * 8 + _EQUALITY_OPS = { "eq", "ne", @@ -35,46 +36,6 @@ } -@njit -def mask_get(mask, pos): - return (mask[pos // mask_bitsize] >> (pos % mask_bitsize)) & 1 - - -@njit -def check_equals_float(a, b): - return ( - a == b - or (isnan(a) and isnan(b)) - or ((isinf(a) and a < 0) and (isinf(b) and b < 0)) - or ((isinf(a) and a > 0) and (isinf(b) and b > 0)) - ) - - -@njit -def rint(x): - """Round to the nearest integer. - - Returns - ------- - The nearest integer, as a float. 
- """ - y = floor(x) - r = x - y - - if r > 0.5: - y += 1.0 - if r == 0.5: - r = y - 2.0 * floor(0.5 * y) - if r == 1.0: - y += 1.0 - return y - - -@njit -def check_equals_int(a, b): - return a == b - - def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(size, (tuple, list)): @@ -110,72 +71,6 @@ def scalar_broadcast_to(scalar, size, dtype=None): return out_col -def normalize_index(index, size, doraise=True): - """Normalize negative index - """ - if index < 0: - index = size + index - if doraise and not (0 <= index < size): - raise IndexError("out-of-bound") - return min(index, size) - - -list_types_tuple = (list, np.array) - - -def get_result_name(left, right): - """ - This function will give appropriate name for the operations - involving two Series, Index's or combination of both. - - Parameters - ---------- - left : {Series, Index} - right : object - - Returns - ------- - name : object {string or None} - """ - - if isinstance(right, (cudf.Series, cudf.Index, pd.Series, pd.Index)): - name = compare_and_get_name(left, right) - else: - name = left.name - return name - - -def compare_and_get_name(a, b): - """ - If both a & b have name attribute, and they are - same return the common name. - Else, return either one of the name of a or b, - whichever is present. - - Parameters - ---------- - a : object - b : object - - Returns - ------- - name : str or None - """ - a_has = hasattr(a, "name") - b_has = hasattr(b, "name") - - if a_has and b_has: - if a.name == b.name: - return a.name - else: - return None - elif a_has: - return a.name - elif b_has: - return b.name - return None - - def initfunc(f): """ Decorator for initialization functions that should @@ -193,24 +88,6 @@ def wrapper(*args, **kwargs): return wrapper -def get_null_series(size, dtype=np.bool_): - """ - Creates a null series of provided dtype and size - - Parameters - ---------- - size: length of series - dtype: dtype of series to create; defaults to bool. - - Returns - ------- - a null cudf series of provided `size` and `dtype` - """ - - empty_col = column.column_empty(size, dtype, True) - return cudf.Series(empty_col) - - # taken from dask array # https://github.com/dask/dask/blob/master/dask/array/utils.py#L352-L363 def _is_nep18_active(): @@ -267,6 +144,9 @@ class cached_property: it with `del`. """ + # TODO: Can be replaced with functools.cached_property when we drop support + # for Python 3.7. + def __init__(self, func): self.func = func @@ -279,22 +159,42 @@ def __get__(self, instance, cls): return value -def time_col_replace_nulls(input_col): - - null = column.column_empty_like(input_col, masked=True, newsize=1) - out_col = cudf._lib.replace.replace( - input_col, - column.as_column( - Buffer( - np.array( - [input_col.default_na_value()], dtype=input_col.dtype - ).view("|u1") - ), - dtype=input_col.dtype, - ), - null, - ) - return out_col +class GetAttrGetItemMixin: + """This mixin changes `__getattr__` to attempt a `__getitem__` call. + + Classes that include this mixin gain enhanced functionality for the + behavior of attribute access like `obj.foo`: if `foo` is not an attribute + of `obj`, obj['foo'] will be attempted, and the result returned. To make + this behavior safe, classes that include this mixin must define a class + attribute `_PROTECTED_KEYS` that defines the attributes that are accessed + within `__getitem__`. For example, if `__getitem__` is defined as + `return self._data[key]`, we must define `_PROTECTED_KEYS={'_data'}`. 
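+
+    A minimal sketch of a class using this mixin (hypothetical, for
+    illustration only):
+
+        class Wrapper(GetAttrGetItemMixin):
+            _PROTECTED_KEYS = frozenset(("_data",))
+
+            def __init__(self, data):
+                self._data = data
+
+            def __getitem__(self, key):
+                return self._data[key]
+
+    With this definition, `Wrapper({"x": 1}).x` falls back to `__getitem__`
+    and returns 1, while attribute lookups of `_data` itself never re-enter
+    `__getitem__`.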
+ """ + + # Tracking of protected keys by each subclass is necessary to make the + # `__getattr__`->`__getitem__` call safe. See + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 + # for an explanation. In brief, defining the `_PROTECTED_KEYS` allows this + # class to avoid calling `__getitem__` inside `__getattr__` when + # `__getitem__` will internally again call `__getattr__`, resulting in an + # infinite recursion. + # This problem only arises when the copy protocol is invoked (e.g. by + # `copy.copy` or `pickle.dumps`), and could also be avoided by redefining + # methods involved with the copy protocol such as `__reduce__` or + # `__setstate__`, but this class may be used in complex multiple + # inheritance hierarchies that might also override serialization. The + # solution here is a minimally invasive change that avoids such conflicts. + _PROTECTED_KEYS: Union[FrozenSet[str], Set[str]] = frozenset() + + def __getattr__(self, key): + if key in self._PROTECTED_KEYS: + raise AttributeError + try: + return self[key] + except KeyError: + raise AttributeError( + f"{type(self).__name__} object has no attribute {key}" + ) def raise_iteration_error(obj): @@ -317,7 +217,8 @@ def pa_mask_buffer_to_mask(mask_buf, size): return Buffer(mask_buf) -def isnat(val): +def _isnat(val): + """Wraps np.isnat to return False instead of error on invalid inputs.""" if not isinstance(val, (np.datetime64, np.timedelta64, str)): return False else: @@ -461,7 +362,7 @@ def get_appropriate_dispatched_func( def _cast_to_appropriate_cudf_type(val, index=None): # Handle scalar if val.ndim == 0: - return cudf.Scalar(val).value + return to_cudf_compatible_scalar(val) # 1D array elif (val.ndim == 1) or (val.ndim == 2 and val.shape[1] == 1): # if index is not None and is of a different length diff --git a/python/cudf/requirements/cuda-10.1/dev_requirements.txt b/python/cudf/requirements/cuda-10.1/dev_requirements.txt deleted file mode 100644 index 967974d38b5..00000000000 --- a/python/cudf/requirements/cuda-10.1/dev_requirements.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -# pyarrow gpu package will have to be built from source : -# https://arrow.apache.org/docs/python/install.html#installing-from-source - -cupy-cuda101 -cachetools -cmake -cmake-setuptools>=0.1.3 -cython>=0.29,<0.30 -dlpack -fastavro>=0.22.9 -flatbuffers -fsspec>=0.6.0 -hypothesis -mimesis -mypy==0.782 -nbsphinx -numba>=0.49.0,!=0.51.0 -numpy -numpydoc -nvtx>=0.2.1 -packaging -pandas>=1.0,<1.3.0dev0 -pandoc==2.0a4 -protobuf -pyorc -pytest -pytest-benchmark -pytest-xdist -rapidjson -recommonmark -setuptools -sphinx -sphinx-copybutton -sphinx-markdown-tables -sphinx_rtd_theme -sphinxcontrib-websupport -typing_extensions -typing_extensions -wheel \ No newline at end of file diff --git a/python/cudf/requirements/cuda-10.2/dev_requirements.txt b/python/cudf/requirements/cuda-10.2/dev_requirements.txt deleted file mode 100644 index 34450456b5a..00000000000 --- a/python/cudf/requirements/cuda-10.2/dev_requirements.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -# pyarrow gpu package will have to be built from source : -# https://arrow.apache.org/docs/python/install.html#installing-from-source - -cupy-cuda102 -cachetools -cmake -cmake-setuptools>=0.1.3 -cython>=0.29,<0.30 -dlpack -fastavro>=0.22.9 -flatbuffers -fsspec>=0.6.0 -hypothesis -mimesis -mypy==0.782 -nbsphinx -numba>=0.49.0,!=0.51.0 -numpy -numpydoc -nvtx>=0.2.1 -packaging -pandas>=1.0,<1.3.0dev0 -pandoc==2.0a4 -protobuf -pyorc -pytest -pytest-benchmark -pytest-xdist -rapidjson -recommonmark -setuptools -sphinx -sphinx-copybutton -sphinx-markdown-tables -sphinx_rtd_theme -sphinxcontrib-websupport -typing_extensions -typing_extensions -wheel \ No newline at end of file diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt index 278b1a6bf61..db9a19537d2 100644 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.0/dev_requirements.txt @@ -21,7 +21,7 @@ numpy numpydoc nvtx>=0.2.1 packaging -pandas>=1.0,<1.3.0dev0 +pandas>=1.0,<=1.2.4 pandoc==2.0a4 protobuf pyorc diff --git a/python/cudf/requirements/cuda-11.1/dev_requirements.txt b/python/cudf/requirements/cuda-11.1/dev_requirements.txt index fafdc7d7d4f..488d0daabd7 100644 --- a/python/cudf/requirements/cuda-11.1/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.1/dev_requirements.txt @@ -21,7 +21,7 @@ numpy numpydoc nvtx>=0.2.1 packaging -pandas>=1.0,<1.3.0dev0 +pandas>=1.0,<=1.2.4 pandoc==2.0a4 protobuf pyorc diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt index db434b7c8ec..33875e01c58 100644 --- a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.2/dev_requirements.txt @@ -21,7 +21,7 @@ numpy numpydoc nvtx>=0.2.1 packaging -pandas>=1.0,<1.3.0dev0 +pandas>=1.0,<=1.2.4 pandoc==2.0a4 protobuf pyorc diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 5d95516c0dd..67a2238eeca 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -24,7 +24,7 @@ "fastavro>=0.22.9", "fsspec>=0.6.0", "numpy", - "pandas>=1.0,<1.3.0dev0", + "pandas>=1.0,<=1.2.4", "typing_extensions", "protobuf", "nvtx>=0.2.1", diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 2803212a502..61cebbfd8db 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -285,7 +285,7 @@ def _is_supported(arg, supported: set): _global_set = set() for col in arg: if isinstance(arg[col], list): - _global_set.union(set(arg[col])) + _global_set = _global_set.union(set(arg[col])) else: _global_set.add(arg[col]) else: diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index f8ed00beb4f..db469b3b2a9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -7,11 +7,12 @@ import dask from dask import dataframe as dd -import dask_cudf - import cudf from cudf.core._compat import PANDAS_GE_120 +import dask_cudf +from dask_cudf.groupby import _is_supported + @pytest.mark.parametrize("aggregation", ["sum", "mean", "count", "min", "max"]) def test_groupby_basic_aggs(aggregation): @@ -533,3 +534,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): ) dd.assert_eq(gf, pf) + + +@pytest.mark.parametrize( + "arg", + [["not_supported"], {"a": "not_supported"}, {"a": ["not_supported"]}], +) +def 
test_is_supported(arg): + assert _is_supported(arg, {"supported"}) is False diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index c157c0be86f..f98d2c50c99 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -5,7 +5,7 @@ distributed>=2.22.0,<=2021.4.0 fsspec>=0.6.0 numba>=0.49.0,!=0.51.0 numpy -pandas>=1.0,<1.3.0dev0 +pandas>=1.0,<=1.2.4 pytest setuptools wheel diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index f735d895095..9e2224338cc 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -14,13 +14,13 @@ "distributed>=2.22.0,<=2021.4.0", "fsspec>=0.6.0", "numpy", - "pandas>=1.0,<1.3.0dev0", + "pandas>=1.0,<=1.2.4", ] extras_require = { "test": [ "numpy", - "pandas>=1.0,<1.3.0dev0", + "pandas>=1.0,<=1.2.4", "pytest", "numba>=0.49.0,!=0.51.0", "dask==2021.4.0",