diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3623db5a283..69f6634b5c2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,15 @@ repos: language: system files: \.(cu|cuh|h|hpp|cpp|inl)$ args: ['-fallback-style=none'] + - repo: local + hooks: + - id: mypy + name: mypy + description: mypy + pass_filenames: false + entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf + language: system + types: [python] default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index a3c84ba1b72..3b027220032 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,8 @@ - PR #6929 Add `Index.set_names` api - PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support - PR #6885 Share `factorize` implementation with Index and cudf module - - PR #6775 Implement cudf.DateOffset for months +- PR #7039 Support contains() on lists of primitives ## Improvements diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 2534f857ee4..17599c6d74d 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -29,6 +29,10 @@ FLAKE_RETVAL=$? FLAKE_CYTHON=`flake8 --config=python/.flake8.cython` FLAKE_CYTHON_RETVAL=$? +# Run mypy and get results/return code +MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf` +MYPY_CUDF_RETVAL=$? + # Run clang-format and check for a consistent code format CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` CLANG_FORMAT_RETVAL=$? @@ -66,6 +70,14 @@ else echo -e "\n\n>>>> PASSED: flake8-cython style check\n\n" fi +if [ "$MYPY_CUDF_RETVAL" != "0" ]; then + echo -e "\n\n>>>> FAILED: mypy style check; begin output\n\n" + echo -e "$MYPY_CUDF" + echo -e "\n\n>>>> FAILED: mypy style check; end output\n\n" +else + echo -e "\n\n>>>> PASSED: mypy style check\n\n" +fi + if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n" echo -e "$CLANG_FORMAT" @@ -79,7 +91,7 @@ HEADER_META=`ci/checks/headers_test.sh` HEADER_META_RETVAL=$? 
echo -e "$HEADER_META" -RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL) +RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 24882d9b3e2..b810b87111a 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 49675fe2154..b4e95bc6730 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 2917c2c3ce0..3b21f00ab16 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -40,6 +40,8 @@ dependencies: - flake8=3.8.3 - black=19.10 - isort=5.0.7 + - mypy=0.782 + - typing_extensions - pre_commit - dask>=2.22.0 - distributed>=2.22.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index ea93c5eb279..c5f7bd34c25 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -34,6 +34,7 @@ requirements: run: - protobuf - python + - typing_extensions - pandas >=1.0,<1.2.0dev0 - cupy >7.1.0,<9.0.0a0 - numba >=0.49.0 diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0da16cd83b8..1d660e2cd74 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -123,7 +123,9 @@ test: - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp + - test -f $PREFIX/include/cudf/lists/contains.hpp - test -f $PREFIX/include/cudf/lists/gather.hpp - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp - test -f $PREFIX/include/cudf/merge.hpp @@ -170,6 +172,7 @@ test: - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp - test -f $PREFIX/include/cudf/strings/split/split.hpp + - test -f $PREFIX/include/cudf/strings/string_view.hpp - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - test -f $PREFIX/include/cudf/strings/strip.hpp - test -f $PREFIX/include/cudf/strings/substring.hpp @@ -200,7 +203,6 @@ test: - test -f $PREFIX/include/cudf_test/cudf_gtest.hpp - test -f $PREFIX/include/cudf_test/cxxopts.hpp - test -f $PREFIX/include/cudf_test/file_utilities.hpp - - test -f $PREFIX/include/cudf_test/scalar_utilities.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh - test -f $PREFIX/include/cudf_test/type_list_utilities.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 19bde0519db..073f0d62c0a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -164,8 +164,8 @@ 
ConfigureBench(SEARCH_BENCH "${SEARCH_BENCH_SRC}") # - sort benchmark -------------------------------------------------------------------------------- set(SORT_BENCH_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cu") + "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cpp") ConfigureBench(SORT_BENCH "${SORT_BENCH_SRC}") diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index 6006be505bc..d17e7b126c7 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -63,9 +63,9 @@ void BM_parq_write_varying_inout(benchmark::State& state) void BM_parq_write_varying_options(benchmark::State& state) { - auto const compression = static_cast(state.range(0)); - auto const enable_stats = static_cast(state.range(1)); - auto const output_metadata = state.range(2) != 0; + auto const compression = static_cast(state.range(0)); + auto const enable_stats = static_cast(state.range(1)); + auto const file_path = state.range(2) != 0 ? "unused_path.parquet" : ""; auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), @@ -82,8 +82,7 @@ void BM_parq_write_varying_options(benchmark::State& state) cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) .stats_level(enable_stats) - .return_filemetadata(output_metadata) - .column_chunks_file_path("dummy_path.parquet"); + .column_chunks_file_path(file_path); cudf_io::write_parquet(options); } diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp index 3dd2c3782fa..b38dda4d17e 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,12 +72,11 @@ void PQ_write_chunked(benchmark::State& state) cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::chunked_parquet_writer_options opts = cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info()); - auto writer_state = cudf_io::write_parquet_chunked_begin(opts); - std::for_each( - tables.begin(), tables.end(), [&writer_state](std::unique_ptr const& tbl) { - cudf_io::write_parquet_chunked(*tbl, writer_state); - }); - cudf_io::write_parquet_chunked_end(writer_state); + cudf_io::parquet_chunked_writer writer(opts); + std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr const& tbl) { + writer.write(*tbl); + }); + writer.close(); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); diff --git a/cpp/benchmarks/sort/sort_benchmark.cu b/cpp/benchmarks/sort/sort_benchmark.cpp similarity index 68% rename from cpp/benchmarks/sort/sort_benchmark.cu rename to cpp/benchmarks/sort/sort_benchmark.cpp index 2ba99eb53d9..89eea0f0ce9 100644 --- a/cpp/benchmarks/sort/sort_benchmark.cu +++ b/cpp/benchmarks/sort/sort_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include @@ -24,18 +22,17 @@ #include #include -#include - -#include "../common/generate_benchmark_input.hpp" -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include +#include +#include template class Sort : public cudf::benchmark { }; template -static void BM_sort(benchmark::State& state) +static void BM_sort(benchmark::State& state, bool nulls) { using Type = int; using column_wrapper = cudf::test::fixed_width_column_wrapper; @@ -44,16 +41,16 @@ static void BM_sort(benchmark::State& state) const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; const cudf::size_type n_cols{(cudf::size_type)state.range(1)}; - auto type_size = cudf::size_of(cudf::data_type(cudf::type_to_id())); // Create columns with values in the range [0,100) std::vector columns; columns.reserve(n_cols); std::generate_n(std::back_inserter(columns), n_cols, [&, n_rows]() { - auto valids = cudf::test::make_counting_transform_iterator( - 0, [](auto i) { return i % 100 == 0 ? false : true; }); auto elements = cudf::test::make_counting_transform_iterator( 0, [&](auto row) { return distribution(generator); }); + if (!nulls) return column_wrapper(elements, elements + n_rows); + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? false : true; }); return column_wrapper(elements, elements + n_rows, valids); }); @@ -70,14 +67,16 @@ static void BM_sort(benchmark::State& state) } } -#define SORT_BENCHMARK_DEFINE(name, stable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Sort, name, stable) \ - (::benchmark::State & st) { BM_sort(st); } \ - BENCHMARK_REGISTER_F(Sort, name) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ - ->UseManualTime() \ +#define SORT_BENCHMARK_DEFINE(name, stable, nulls) \ + BENCHMARK_TEMPLATE_DEFINE_F(Sort, name, stable) \ + (::benchmark::State & st) { BM_sort(st, nulls); } \ + BENCHMARK_REGISTER_F(Sort, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -SORT_BENCHMARK_DEFINE(sort_stable, true) -SORT_BENCHMARK_DEFINE(sort_unstable, false) +SORT_BENCHMARK_DEFINE(unstable_no_nulls, false, false) +SORT_BENCHMARK_DEFINE(stable_no_nulls, true, false) +SORT_BENCHMARK_DEFINE(unstable, false, true) +SORT_BENCHMARK_DEFINE(stable, true, true) diff --git a/cpp/benchmarks/sort/sort_strings_benchmark.cu b/cpp/benchmarks/sort/sort_strings_benchmark.cpp similarity index 100% rename from cpp/benchmarks/sort/sort_strings_benchmark.cu rename to cpp/benchmarks/sort/sort_strings_benchmark.cpp diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 5ae1eaa2b9d..fbf68a20364 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -99,7 +99,6 @@ __launch_bounds__(block_size) __global__ { T* __restrict__ output_data = output_view.data(); cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - constexpr cudf::size_type leader_lane{0}; static_assert(block_size <= 1024, "Maximum thread block size exceeded"); int tid = threadIdx.x + per_thread * block_size * blockIdx.x; @@ -109,8 +108,8 @@ __launch_bounds__(block_size) __global__ __shared__ bool temp_valids[has_validity ? 
block_size + cudf::detail::warp_size : 1]; __shared__ T temp_data[block_size]; - cudf::size_type warp_valid_counts{0}; - cudf::size_type block_sum = 0; + cudf::size_type warp_valid_counts{0}; // running count of valid values across the `per_thread` loop below + cudf::size_type block_sum = 0; // running count of values passing the filter across the `per_thread` loop below // Note that since the maximum gridDim.x on all supported GPUs is as big as // cudf::size_type, this loop is sufficient to cover our maximum column size @@ -160,6 +159,8 @@ const int wid = threadIdx.x / cudf::detail::warp_size; const int lane = threadIdx.x % cudf::detail::warp_size; + cudf::size_type tmp_warp_valid_counts{0}; + if (tmp_block_sum > 0 && wid <= last_warp) { int valid_index = (block_offset / cudf::detail::warp_size) + wid; @@ -168,9 +169,8 @@ // Note the atomicOr's below assume that output_valid has been set to // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - warp_valid_counts = __popc(valid_warp); + tmp_warp_valid_counts = __popc(valid_warp); if (wid > 0 && wid < last_warp) output_valid[valid_index] = valid_warp; else { @@ -182,19 +182,22 @@ if ((wid == 0) && (last_warp == num_warps)) { uint32_t valid_warp = __ballot_sync(0xffffffff, temp_valids[block_size + threadIdx.x]); if (lane == 0 && valid_warp != 0) { - warp_valid_counts += __popc(valid_warp); + tmp_warp_valid_counts += __popc(valid_warp); atomicOr(&output_valid[valid_index + num_warps], valid_warp); } } } + warp_valid_counts += tmp_warp_valid_counts; } block_offset += tmp_block_sum; tid += block_size; } // Compute total null_count for this block and add it to global count + constexpr cudf::size_type leader_lane{0}; cudf::size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); + if (threadIdx.x == 0) { // one thread computes and adds to null count atomicAdd(output_null_count, block_sum - block_valid_count); } diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 75a710d1d5c..e95d932920e 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -174,6 +174,21 @@ auto inline make_validity_iterator(column_device_view const& column) validity_accessor{column}); } +/** + * @brief Constructs a constant device iterator over a scalar's validity. + * + * Dereferencing the returned iterator returns a `bool`. + * + * For `p = *(iter + i)`, `p` is the validity of the scalar. + * + * @param scalar_value The scalar to iterate + * @return auto Iterator that returns scalar validity + */ +auto inline make_validity_iterator(scalar const& scalar_value) +{ + return thrust::make_constant_iterator(scalar_value.is_valid()); +} + /** * @brief value accessor for scalar with valid data. * The unary functor returns data of Element type of the scalar.
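The `make_validity_iterator(scalar const&)` overload added above is a thin wrapper over `thrust::make_constant_iterator`: it broadcasts a single validity flag so a scalar operand can flow through the same element-wise code path as a column operand. A minimal host-side sketch of the idea, standalone, with a plain `bool` standing in for `scalar_value.is_valid()`:

#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform.h>

#include <cassert>

int main()
{
  bool const scalar_is_valid = true;  // stands in for scalar_value.is_valid()
  auto scalar_validity       = thrust::make_constant_iterator(scalar_is_valid);

  bool column_validity[] = {true, false, true};  // per-row validity of a column operand
  bool out[3];

  // A binary op's output row is valid only when both inputs are valid; the
  // constant iterator lets the scalar side reuse the column-oriented loop.
  thrust::transform(thrust::seq,
                    column_validity,
                    column_validity + 3,
                    scalar_validity,  // every dereference yields the same flag
                    out,
                    [](bool lhs, bool rhs) { return lhs && rhs; });

  assert(out[0] && !out[1] && out[2]);
  return 0;
}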
diff --git a/cpp/include/cudf/detail/utilities/trie.cuh b/cpp/include/cudf/detail/utilities/trie.cuh index 5370c8678cf..77b184a4874 100644 --- a/cpp/include/cudf/detail/utilities/trie.cuh +++ b/cpp/include/cudf/detail/utilities/trie.cuh @@ -135,24 +135,22 @@ inline thrust::host_vector createSerializedTrie( * @return Boolean value, true if string is found, false otherwise */ __host__ __device__ inline bool serialized_trie_contains(device_span trie, - char const *key, - size_t key_len) + device_span key) { if (trie.data() == nullptr || trie.empty()) return false; - if (key_len == 0) return trie[0].is_leaf; - int curr_node = 1; - for (size_t i = 0; i < key_len; ++i) { + if (key.empty()) return trie.front().is_leaf; + auto curr_node = trie.begin() + 1; + for (auto curr_key = key.begin(); curr_key < key.end(); ++curr_key) { // Don't jump away from root node - if (i != 0) { curr_node += trie[curr_node].children_offset; } + if (curr_key != key.begin()) { curr_node += curr_node->children_offset; } // Search for the next character in the array of children nodes // Nodes are sorted - terminate search if the node is larger or equal - while (trie[curr_node].character != trie_terminating_character && - trie[curr_node].character < key[i]) { + while (curr_node->character != trie_terminating_character && curr_node->character < *curr_key) { ++curr_node; } // Could not find the next character, done with the search - if (trie[curr_node].character != key[i]) { return false; } + if (curr_node->character != *curr_key) { return false; } } // Even if the node is present, return true only if that node is at the end of a word - return trie[curr_node].is_leaf; + return curr_node->is_leaf; } diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 163d8c9d735..2c946dae748 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,14 +20,26 @@ #pragma once -#include - +#include +#include +#include #include +#include + +#include +#include namespace cudf { namespace io { + +// Forward declaration +class parquet_reader_options; +class parquet_writer_options; +class chunked_parquet_writer_options; + namespace detail { namespace parquet { + /** * @brief Class to read Parquet dataset data into columns. */ @@ -90,63 +102,54 @@ class writer { * * @param sink The data sink to write the data to * @param options Settings for controlling writing behavior + * @param mode Option to write at once or in chunks * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches */ explicit writer(std::unique_ptr sink, parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~writer(); + SingleWriteMode mode = SingleWriteMode::YES, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Writes the dataset as per options provided. + * @brief Constructor for writer to handle chunked parquet options. 
* - * @param table Set of columns to output - * @param metadata Table metadata and column names - * @param return_filemetadata If true, return the raw file metadata - * @param column_chunks_file_path Column chunks file path to be set in the raw output metadata - * @param int96_timestamps If true, write timestamps as INT96 values - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param sink The data sink to write the data to + * @param options Settings for controlling writing behavior for chunked writer + * @param mode Option to write at once or in chunks + * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches */ - std::unique_ptr> write( - table_view const& table, - const table_metadata* metadata = nullptr, - bool return_filemetadata = false, - const std::string column_chunks_file_path = "", - std::vector const& decimal_precision = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + explicit writer(std::unique_ptr<data_sink> sink, + chunked_parquet_writer_options const& options, + SingleWriteMode mode = SingleWriteMode::NO, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Begins the chunked/streamed write process. - * - * @param[in] pq_chunked_state Internal state maintained between chunks. + * @brief Destructor explicitly declared to avoid being inlined in the header */ - void write_chunked_begin(struct pq_chunked_state& state); + ~writer(); /** * @brief Writes a single subtable as part of a larger parquet file/table write. * * @param[in] table The table information to be written - * @param[in] pq_chunked_state Internal state maintained between chunks. */ - void write_chunk(table_view const& table, struct pq_chunked_state& state); + void write(table_view const& table); /** * @brief Finishes the chunked/streamed write process. * - * @param[in] pq_chunked_state Internal state maintained between chunks. - * @param[in] return_filemetadata If true, return the raw file metadata * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata * - * @return A parquet-compatible blob that contains the data for all rowgroups in the list + * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if + * `column_chunks_file_path` is provided; otherwise null.
*/ - std::unique_ptr> write_chunked_end( - struct pq_chunked_state& state, - bool return_filemetadata = false, - const std::string& column_chunks_file_path = ""); + std::unique_ptr> close(std::string const& column_chunks_file_path = ""); /** * @brief Merges multiple metadata blobs returned by write_all into a single metadata blob diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp index 3c674985ef9..adb7078d96d 100644 --- a/cpp/include/cudf/io/detail/utils.hpp +++ b/cpp/include/cudf/io/detail/utils.hpp @@ -20,7 +20,7 @@ namespace cudf { namespace io { namespace detail { /** - * @brief Whether writer writes in chunks or at once + * @brief Whether writer writes in chunks or all at once */ enum class SingleWriteMode : bool { YES, NO }; } // namespace detail diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index a602fb2cfcf..cd3b7bf27da 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -395,8 +396,6 @@ class parquet_writer_options { table_view _table; // Optional associated metadata const table_metadata* _metadata = nullptr; - // Optionally return the raw parquet file metadata output - bool _return_filemetadata = false; // Parquet writes can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata @@ -473,11 +472,6 @@ class parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } - /** - * @brief Returns `true` if metadata is required, `false` otherwise. - */ - bool is_enabled_return_filemetadata() const { return _return_filemetadata; } - /** * @brief Returns Column chunks file path to be set in the raw output metadata. */ @@ -509,13 +503,6 @@ class parquet_writer_options { */ void set_compression(compression_type compression) { _compression = compression; } - /** - * @brief Sets whether filemetadata is required or not. - * - * @param req Boolean value to enable/disable return of file metadata. - */ - void enable_return_filemetadata(bool req) { _return_filemetadata = req; } - /** * @brief Sets timestamp writing preferences. INT96 timestamps will be written * if `true` and TIMESTAMP_MICROS will be written if `false`. @@ -598,18 +585,6 @@ class parquet_writer_options_builder { return *this; } - /** - * @brief Sets whether filemetadata is required or not in parquet_writer_options. - * - * @param req Boolean value to enable/disable return of file metadata. - * @return this for chaining. - */ - parquet_writer_options_builder& return_filemetadata(bool req) - { - options._return_filemetadata = req; - return *this; - } - /** * @brief Sets column chunks file path to be set in the raw output metadata. * @@ -899,82 +874,77 @@ class chunked_parquet_writer_options_builder { }; /** - * @brief Forward declaration of anonymous chunked-writer state struct. + * @brief Merges multiple raw metadata blobs that were previously created by write_parquet + * into a single metadata blob + * + * @ingroup io_writers + * + * @param[in] metadata_list List of input file metadata + * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ -struct pq_chunked_state; +std::unique_ptr> merge_rowgroup_metadata( + const std::vector>>& metadata_list); /** - * @brief Begin the process of writing a parquet file in a chunked/stream form. 
+ * @brief Chunked parquet writer class to handle options and write tables in chunks. * - * The intent of the write_parquet_chunked_ path is to allow writing of an + * The intent of the parquet_chunked_writer is to allow writing of an * arbitrarily large / arbitrary number of rows to a parquet file in multiple passes. * * The following code snippet demonstrates how to write a single parquet file containing * one logical table by writing a series of individual cudf::tables. + * * @code * ... * std::string filepath = "dataset.parquet"; * cudf::io::chunked_parquet_writer_options options = * cudf::io::chunked_parquet_writer_options::builder(cudf::sink_info(filepath), table->view()); * ... - * auto state = cudf::write_parquet_chunked_begin(options); - * cudf::write_parquet_chunked(table0, state); - * cudf::write_parquet_chunked(table1, state); - * ... - * cudf_write_parquet_chunked_end(state); - * @endcode - * - * @param[in] options Settings for controlling writing behavior. - * @param[in] mr Device memory resource to use for device memory allocation. - * - * @return pointer to an anonymous state structure storing information about the chunked write. - * this pointer must be passed to all subsequent write_parquet_chunked() and - * write_parquet_chunked_end() calls. + * cudf::io::parquet_chunked_writer writer(options); + * writer.write(table0); + * writer.write(table1); + * ... + * writer.close(); + * @endcode */ -std::shared_ptr write_parquet_chunked_begin( - chunked_parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +class parquet_chunked_writer { + public: + /** + * @brief Default constructor; this should never be used. + * It exists only to satisfy Cython. + */ + parquet_chunked_writer() = default; -/** - * @brief Write a single table as a subtable of a larger logical parquet file/table. - * - * All tables passed into multiple calls of this function must contain the same # of columns and - * have columns of the same type. - * - * @param[in] table The table data to be written. - * @param[in] state Opaque state information about the writer process. Must be the same pointer - * returned from write_parquet_chunked_begin(). - * @param[in] int96_timestamps Write out timestamps as INT96 type - */ -void write_parquet_chunked(table_view const& table, std::shared_ptr state); + /** + * @brief Constructor with chunked writer options + * + * @param[in] op Options used to write the tables + * @param[in] mr Device memory resource to use for device memory allocation + */ + parquet_chunked_writer( + chunked_parquet_writer_options const& op, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Finish writing a chunked/stream parquet file. - * - * @param[in] state Opaque state information about the writer process. Must be the same pointer - * returned from write_parquet_chunked_begin(). - * @param[in] return_filemetadata If true, return the raw file metadata. - * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata. - * - * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if - * requested in parquet_writer_options (empty blob otherwise). - */ -std::unique_ptr> write_parquet_chunked_end( - std::shared_ptr& state, - bool return_filemetadata = false, - const std::string& column_chunks_file_path = ""); + /** + * @brief Writes a table to the output.
+ * + * @param[in] table Table to be written + * @return Reference to this writer, to allow chaining of write calls + */ + parquet_chunked_writer& write(table_view const& table); -/** - * @brief Merges multiple raw metadata blobs that were previously created by write_parquet - * into a single metadata blob - * - * @ingroup io_writers - * - * @param[in] metadata_list List of input file metadata - * @return A parquet-compatible blob that contains the data for all rowgroups in the list - */ -std::unique_ptr> merge_rowgroup_metadata( - const std::vector>>& metadata_list); + /** + * @brief Finishes the chunked/streamed write process. + * + * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata + * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if + * `column_chunks_file_path` is provided; otherwise null. + */ + std::unique_ptr<std::vector<uint8_t>> close(std::string const& column_chunks_file_path = ""); + + // Unique pointer to impl writer class + std::unique_ptr<cudf::io::detail::parquet::writer> writer; }; /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp new file mode 100644 index 00000000000..7cd40bb2f86 --- /dev/null +++ b/cpp/include/cudf/lists/contains.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_contains + * @{ + * @file + */ + +/** + * @brief Create a column of bool values indicating whether the specified scalar + * is an element of each row of a list column. + * + * The output column has as many elements as the input `lists` column. + * Output `column[i]` is set to true if the lists row `lists[i]` contains the value + * specified in `search_key`. Otherwise, it is set to false. + * + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The search key `search_key` is null + * 2. The list row `lists[i]` is null + * 3. The list row `lists[i]` does not contain the search key, and contains at least + * one null. + * + * @param lists Lists column whose `n` rows are to be searched + * @param search_key The scalar key to be looked up in each list row + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return std::unique_ptr<column> BOOL8 column of `n` rows with the result of the lookup + */ +std::unique_ptr<column> contains( + cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a column of bool values indicating whether the list rows of the first + * column contain the corresponding values in the second column + * + * The output column has as many elements as the input `lists` column. + * Output `column[i]` is set to true if the lists row `lists[i]` contains the value + * in `search_keys[i]`. Otherwise, it is set to false.
+ * + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The row `search_keys[i]` is null + * 2. The list row `lists[i]` is null + * 3. The list row `lists[i]` does not contain `search_keys[i]`, and contains at least + * one null. + * + * @param lists Lists column whose `n` rows are to be searched + * @param search_keys Column of elements to be looked up in each list row + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return std::unique_ptr<column> BOOL8 column of `n` rows with the result of the lookup + */ +std::unique_ptr<column> contains( + cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp new file mode 100644 index 00000000000..6b802d2ad5e --- /dev/null +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_elements + * @{ + * @file + */ + +/** + * @brief Returns a numeric column containing the number of rows in + * each list element in the given lists column. + * + * The output column will have the same number of rows as the + * input lists column. Each `output[i]` will be `input[i].size()`. + * + * @code{.pseudo} + * l = { {1, 2, 3}, {4}, {5, 6} } + * r = count_elements(l) + * r is now {3, 1, 2} + * @endcode + * + * Any null input element will result in a corresponding null entry + * in the output column. + * + * @param input Input lists column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New INT32 column with the number of elements for each row. + */ +std::unique_ptr<column> count_elements( + lists_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of lists_elements group + +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 38708d4878e..824b10ced83 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -112,12 +112,82 @@ class list_device_view { */ CUDA_DEVICE_CALLABLE lists_column_device_view const& get_column() const { return lists_column; } + template <typename T> + struct pair_accessor; + + template <typename T> + using const_pair_iterator = + thrust::transform_iterator<pair_accessor<T>, thrust::counting_iterator<size_type>>; + + /** + * @brief Fetcher for a pair iterator to the first element in the list_device_view. + * + * Dereferencing the returned iterator yields a `thrust::pair<T, bool>`. + * + * If the element at index `i` is valid, then for `p = iter[i]`, + * 1. `p.first` is the value of the element at `i` + * 2. `p.second == true` + * + * If the element at index `i` is null, + * 1. `p.first` is undefined + * 2. `p.second == false` + */ + template <typename T> + CUDA_DEVICE_CALLABLE const_pair_iterator<T> pair_begin() const + { + return const_pair_iterator<T>{thrust::counting_iterator<size_type>(0), pair_accessor<T>{*this}}; + } + + /** + * @brief Fetcher for a pair iterator to one position past the last element in the + * list_device_view. + */ + template <typename T> + CUDA_DEVICE_CALLABLE const_pair_iterator<T> pair_end() const + { + return const_pair_iterator<T>{thrust::counting_iterator<size_type>(size()), + pair_accessor<T>{*this}}; + } + private: lists_column_device_view const& lists_column; size_type _row_index{}; // Row index in the Lists column vector. size_type _size{}; // Number of elements in *this* list row. size_type begin_offset; // Offset in list_column_device_view where this list begins. + + /** + * @brief Pair accessor for elements in a `list_device_view` + * + * This unary functor returns a pair of: + * 1. data element at a specified index + * 2. boolean validity flag for that element + * + * @tparam T The element-type of the list row + */ + template <typename T> + struct pair_accessor { + list_device_view const& list; + + /** + * @brief Constructor + * + * @param _list The `list_device_view` whose rows are being accessed. + */ + explicit CUDA_HOST_DEVICE_CALLABLE pair_accessor(list_device_view const& _list) : list{_list} {} + + /** + * @brief Accessor for the {data, validity} pair at the specified index + * + * @param i Index into the list_device_view + * @return A pair of data element and its validity flag. + */ + CUDA_DEVICE_CALLABLE + thrust::pair<T, bool> operator()(cudf::size_type i) const + { + return {list.element<T>(i), !list.is_null(i)}; + } + }; }; } // namespace cudf diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index 4561554a0f5..29c9fa2e720 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -97,6 +97,48 @@ std::unique_ptr<column> byte_cast( flip_endianness endian_configuration, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + *``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * ``` + * Note that null lists are completely removed from the output, while nulls and + * empty lists that occur inside a list row are pulled out as rows and remain. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with the column at `explode_column_idx` exploded.
+ */ +std::unique_ptr<table> explode( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 2f4a54e8143..ded833f4ca0 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -430,11 +430,7 @@ class string_scalar : public scalar { string_scalar(value_type const& source, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar(data_type(type_id::STRING), is_valid), - _data(source.data(), source.size_bytes(), stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new string scalar object from string_view in device memory @@ -448,10 +444,7 @@ class string_scalar : public scalar { string_scalar(rmm::device_scalar<value_type>& data, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : string_scalar(data.value(stream), is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Implicit conversion operator to get the value of the scalar in a host std::string @@ -470,10 +463,7 @@ * * @param stream CUDA stream used for device memory operations. */ - value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const - { - return value_type{data(), size()}; - } + value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns the size of the string in bytes diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index aa3cd932f4f..d1b542a6cf2 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include /** diff --git a/cpp/include/cudf/strings/detail/sorting.cuh b/cpp/include/cudf/strings/detail/sorting.cuh deleted file mode 100644 index d23c6d3d4f4..00000000000 --- a/cpp/include/cudf/strings/detail/sorting.cuh +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace strings { -namespace detail { - -/** - * @brief Comparator for sorting strings column rows. - */ -struct sort_strings_comparator { - __device__ bool operator()(size_type lhs, size_type rhs) - { - if (has_nulls) { - bool lhs_null{d_column.is_null(lhs)}; - bool rhs_null{d_column.is_null(rhs)}; - if (lhs_null || rhs_null) { - if (!ascending) thrust::swap(lhs_null, rhs_null); - return null_prec == cudf::null_order::BEFORE ? !rhs_null : !lhs_null; - } - } - auto const lhs_str = d_column.element(lhs); - auto const rhs_str = d_column.element(rhs); - auto const cmp = lhs_str.compare(rhs_str); - return ascending ? (cmp < 0) : (cmp > 0); - } - column_device_view const d_column; - bool has_nulls; - bool ascending; - cudf::null_order null_prec; -}; - -/** - * @brief Returns an indices column that is the sorted rows of the - * input strings column. - * - * @param strings Strings instance for this operation. - * @param sort_order Sort strings in ascending or descending order. - * @param null_precedence Sort nulls to the beginning or the end of the new column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Indices of the sorted rows. - */ -template -std::unique_ptr sorted_order( - strings_column_view const strings, - cudf::order sort_order = cudf::order::ASCENDING, - cudf::null_order null_precedence = cudf::null_order::BEFORE, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - std::unique_ptr sorted_indices = cudf::make_numeric_column( - data_type(type_to_id()), strings.size(), mask_state::UNALLOCATED, stream, mr); - auto d_indices = sorted_indices->mutable_view(); - thrust::sequence( - rmm::exec_policy(stream), d_indices.begin(), d_indices.end(), 0); - - sort_strings_comparator comparator{ - d_column, strings.has_nulls(), sort_order == cudf::order::ASCENDING, null_precedence}; - if (stable) { - thrust::stable_sort(rmm::exec_policy(stream), - d_indices.begin(), - d_indices.end(), - comparator); - } else { - thrust::sort(rmm::exec_policy(stream), - d_indices.begin(), - d_indices.end(), - comparator); - } - return sorted_indices; -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index aca719ad978..ba903c87485 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 802312d91b1..9a57ac1e20d 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,316 +13,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once -#include -#include -#include -#include +#include -/** - * @file - * @brief Class definition for cudf::string_view. - */ - -namespace cudf { +#include +#include +#include -using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes - -/** - * @brief A non-owning, immutable view of device data that is a variable length - * char array representing a UTF-8 string. - * - * @ingroup strings_classes - * - * The caller must maintain the device memory for the lifetime of this instance. - * - * It provides a simple wrapper and string operations for an individual string - * within a column of strings. - */ -class string_view { - public: - /** - * @brief Default constructor represents an empty string. - */ - __host__ __device__ string_view(); - - /** - * @brief Create instance from existing device char array. - * - * @param data Device char array encoded in UTF8. - * @param bytes Number of bytes in data array. - */ - __host__ __device__ string_view(const char* data, size_type bytes); - - string_view(const string_view&) = default; - string_view(string_view&&) = default; - ~string_view() = default; - string_view& operator=(const string_view&) = default; - string_view& operator=(string_view&&) = default; - - /** - * @brief Return the number of bytes in this string - */ - __host__ __device__ size_type size_bytes() const; - /** - * @brief Return the number of characters in this string - */ - __device__ size_type length() const; - /** - * @brief Return a pointer to the internal device array - */ - __host__ __device__ const char* data() const; - - /** - * @brief Return true if string has no characters - */ - __host__ __device__ bool empty() const; - - /** - * @brief Handy iterator for navigating through encoded characters. 
- */ - class const_iterator { - public: - using difference_type = ptrdiff_t; - using value_type = char_utf8; - using reference = char_utf8&; - using pointer = char_utf8*; - using iterator_category = std::input_iterator_tag; - __device__ const_iterator(const string_view& str, size_type pos); - const_iterator(const const_iterator& mit) = default; - const_iterator(const_iterator&& mit) = default; - const_iterator& operator=(const const_iterator&) = default; - const_iterator& operator=(const_iterator&&) = default; - __device__ const_iterator& operator++(); - __device__ const_iterator operator++(int); - __device__ const_iterator& operator+=(difference_type); - __device__ const_iterator operator+(difference_type); - __device__ const_iterator& operator--(); - __device__ const_iterator operator--(int); - __device__ const_iterator& operator-=(difference_type); - __device__ const_iterator operator-(difference_type); - __device__ bool operator==(const const_iterator&) const; - __device__ bool operator!=(const const_iterator&) const; - __device__ bool operator<(const const_iterator&) const; - __device__ bool operator<=(const const_iterator&) const; - __device__ bool operator>(const const_iterator&) const; - __device__ bool operator>=(const const_iterator&) const; - __device__ char_utf8 operator*() const; - __device__ size_type position() const; - __device__ size_type byte_offset() const; - - private: - const char* p{}; - size_type bytes{}; - size_type char_pos{}; - size_type byte_pos{}; - }; - - /** - * @brief Return new iterator pointing to the beginning of this string - */ - __device__ const_iterator begin() const; - /** - * @brief Return new iterator pointing past the end of this string - */ - __device__ const_iterator end() const; - - /** - * @brief Return single UTF-8 character at the given character position - * - * @param pos Character position - */ - __device__ char_utf8 operator[](size_type pos) const; - /** - * @brief Return the byte offset from data() for a given character position - * - * @param pos Character position - */ - __device__ size_type byte_offset(size_type pos) const; - - /** - * @brief Comparing target string with this string. Each character is compared - * as a UTF-8 code-point value. - * - * @param str Target string to compare with this string. - * @return 0 If they compare equal. - * <0 Either the value of the first character of this string that does - * not match is lower in the arg string, or all compared characters - * match but the arg string is shorter. - * >0 Either the value of the first character of this string that does - * not match is greater in the arg string, or all compared characters - * match but the arg string is longer. - */ - __device__ int compare(const string_view& str) const; - /** - * @brief Comparing target string with this string. Each character is compared - * as a UTF-8 code-point value. - * - * @param str Target string to compare with this string. - * @param bytes Number of bytes in str. - * @return 0 If they compare equal. - * <0 Either the value of the first character of this string that does - * not match is lower in the arg string, or all compared characters - * match but the arg string is shorter. - * >0 Either the value of the first character of this string that does - * not match is greater in the arg string, or all compared characters - * match but the arg string is longer. - */ - __device__ int compare(const char* str, size_type bytes) const; - - /** - * @brief Returns true if rhs matches this string exactly. 
- */ - __device__ bool operator==(const string_view& rhs) const; - /** - * @brief Returns true if rhs does not match this string. - */ - __device__ bool operator!=(const string_view& rhs) const; - /** - * @brief Returns true if this string is ordered before rhs. - */ - __device__ bool operator<(const string_view& rhs) const; - /** - * @brief Returns true if rhs is ordered before this string. - */ - __device__ bool operator>(const string_view& rhs) const; - /** - * @brief Returns true if this string matches or is ordered before rhs. - */ - __device__ bool operator<=(const string_view& rhs) const; - /** - * @brief Returns true if rhs matches or is ordered before this string. - */ - __device__ bool operator>=(const string_view& rhs) const; - - /** - * @brief Returns the character position of the first occurrence where the - * argument str is found in this string within the character range [pos,pos+n). - * - * @param str Target string to search within this string. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if str is not found in this string. - */ - __device__ size_type find(const string_view& str, size_type pos = 0, size_type count = -1) const; - /** - * @brief Returns the character position of the first occurrence where the - * array str is found in this string within the character range [pos,pos+n). - * - * @param str Target array to search within this string. - * @param bytes Number of bytes in str. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type find(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; - /** - * @brief Returns the character position of the first occurrence where - * character is found in this string within the character range [pos,pos+n). - * - * @param character Single encoded character. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type find(char_utf8 character, size_type pos = 0, size_type count = -1) const; - /** - * @brief Returns the character position of the last occurrence where the - * argument str is found in this string within the character range [pos,pos+n). - * - * @param str Target string to search within this string. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type rfind(const string_view& str, size_type pos = 0, size_type count = -1) const; - /** - * @brief Returns the character position of the last occurrence where the - * array str is found in this string within the character range [pos,pos+n). - * - * @param str Target string to search with this string. - * @param bytes Number of bytes in str. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. 
- * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type rfind(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; - /** - * @brief Returns the character position of the last occurrence where - * character is found in this string within the character range [pos,pos+n). - * - * @param character Single encoded character. - * @param pos Character position to start search within this string. - * @param count Number of characters from pos to include in the search. - * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. - */ - __device__ size_type rfind(char_utf8 character, size_type pos = 0, size_type count = -1) const; - - /** - * @brief Return a sub-string of this string. The original string and device - * memory must still be maintained for the lifetime of the returned instance. - * - * @param start Character position to start the sub-string. - * @param length Number of characters from start to include in the sub-string. - * @return New instance pointing to a subset of the characters within this instance. - */ - __device__ string_view substr(size_type start, size_type length) const; - - private: - const char* _data{}; ///< Pointer to device memory contain char array for this string - size_type _bytes{}; ///< Number of bytes in _data for this string - mutable size_type _length{}; ///< Number of characters in this string (computed) - mutable int8_t _char_width{}; ///< Number of bytes per character if uniform width (computed) - - /** - * @brief Return the character position of the given byte offset. - * - * @param bytepos Byte position from start of _data. - * @return The character position for the specified byte. - */ - __device__ size_type character_offset(size_type bytepos) const; -}; +// This file should only include device code logic. +// Host-only or host/device code should be defined in the string_view.hpp header file. +namespace cudf { namespace strings { namespace detail { -/** - * @brief Returns the number of bytes in the specified character. - * - * @param character Single character - * @return Number of bytes - */ -__host__ __device__ size_type bytes_in_char_utf8(char_utf8 character); - -/** - * @brief Convert a char array into a char_utf8 value. - * - * @param str String containing encoded char bytes. - * @param[out] character Single char_utf8 value. - * @return The number of bytes in the character - */ -__host__ __device__ size_type to_char_utf8(const char* str, char_utf8& character); - -/** - * @brief Place a char_utf8 value into a char array. - * - * @param character Single character - * @param[out] str Allocated char array with enough space to hold the encoded characer. - * @return The number of bytes in the character - */ -__host__ __device__ size_type from_char_utf8(char_utf8 character, char* str); /** * @brief Return the number of UTF-8 characters in this provided char array. @@ -331,22 +36,338 @@ __host__ __device__ size_type from_char_utf8(char_utf8 character, char* str); * @param bytes Number of bytes in str. * @return The number of characters in the array. */ -__host__ __device__ size_type characters_in_string(const char* str, size_type bytes); - -/** - * @brief This will return true if passed the first byte of a UTF-8 character. 
- *
- * @param byte Any byte from a valid UTF-8 character
- * @return true if this the first byte of the character
- */
-constexpr bool is_begin_utf8_char(uint8_t byte)
+__device__ inline size_type characters_in_string(const char* str, size_type bytes)
 {
-  // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
-  return (byte & 0xC0) != 0x80;
+  if ((str == 0) || (bytes == 0)) return 0;
+  auto ptr = reinterpret_cast<uint8_t const*>(str);
+  return thrust::count_if(
+    thrust::seq, ptr, ptr + bytes, [](uint8_t chr) { return is_begin_utf8_char(chr); });
 }
-
 }  // namespace detail
 }  // namespace strings
-}  // namespace cudf
-#include "./string_view.inl"
+
+__device__ inline size_type string_view::length() const
+{
+  if (_length == UNKNOWN_STRING_LENGTH)
+    _length = strings::detail::characters_in_string(_data, _bytes);
+  if (_length && (_char_width == UNKNOWN_CHAR_WIDTH)) {
+    uint8_t const* ptr = reinterpret_cast<uint8_t const*>(data());
+    auto const first   = strings::detail::bytes_in_utf8_byte(*ptr);
+    // see if they are all the same width
+    _char_width = (thrust::find_if(thrust::seq,
+                                   ptr,
+                                   ptr + size_bytes(),
+                                   [first](auto ch) {
+                                     auto width = strings::detail::bytes_in_utf8_byte(ch);
+                                     return (width != 0) && (width != first);
+                                   })) == (ptr + size_bytes())
+                    ? first
+                    : VARIABLE_CHAR_WIDTH;
+  }
+  return _length;
+}
+
+// this custom iterator knows about UTF8 encoding
+__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos)
+  : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)}
+{
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
+{
+  if (byte_pos < bytes)
+    byte_pos += strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[byte_pos]));
+  ++char_pos;
+  return *this;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
+{
+  string_view::const_iterator tmp(*this);
+  operator++();
+  return tmp;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
+  string_view::const_iterator::difference_type offset)
+{
+  const_iterator tmp(*this);
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? ++tmp : --tmp;
+  return tmp;
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
+  string_view::const_iterator::difference_type offset)
+{
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? operator++() : operator--();
+  return *this;
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
+{
+  if (byte_pos > 0)
+    while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
+      ;
+  --char_pos;
+  return *this;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
+{
+  string_view::const_iterator tmp(*this);
+  operator--();
+  return tmp;
+}
+
+__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
+  string_view::const_iterator::difference_type offset)
+{
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? operator--() : operator++();
+  return *this;
+}
+
+__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
+  string_view::const_iterator::difference_type offset)
+{
+  const_iterator tmp(*this);
+  size_type adjust = abs(offset);
+  while (adjust-- > 0) offset > 0 ? --tmp : ++tmp;
+  return tmp;
+}
+
+__device__ inline bool string_view::const_iterator::operator==(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos == rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator!=(
+  const string_view::const_iterator& rhs) const
+{
+  return (p != rhs.p) || (char_pos != rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator<(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos < rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator<=(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos <= rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator>(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos > rhs.char_pos);
+}
+
+__device__ inline bool string_view::const_iterator::operator>=(
+  const string_view::const_iterator& rhs) const
+{
+  return (p == rhs.p) && (char_pos >= rhs.char_pos);
+}
+
+__device__ inline char_utf8 string_view::const_iterator::operator*() const
+{
+  char_utf8 chr = 0;
+  strings::detail::to_char_utf8(p + byte_offset(), chr);
+  return chr;
+}
+
+__device__ inline size_type string_view::const_iterator::position() const { return char_pos; }
+
+__device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; }
+
+__device__ inline string_view::const_iterator string_view::begin() const
+{
+  return const_iterator(*this, 0);
+}
+
+__device__ inline string_view::const_iterator string_view::end() const
+{
+  return const_iterator(*this, length());
+}
+
+__device__ inline char_utf8 string_view::operator[](size_type pos) const
+{
+  size_type offset = byte_offset(pos);
+  if (offset >= _bytes) return 0;
+  char_utf8 chr = 0;
+  strings::detail::to_char_utf8(data() + offset, chr);
+  return chr;
+}
+
+__device__ inline size_type string_view::byte_offset(size_type pos) const
+{
+  size_type offset = 0;
+  const char* sptr = _data;
+  const char* eptr = sptr + _bytes;
+  if (_char_width > 0) return pos * _char_width;
+  while ((pos > 0) && (sptr < eptr)) {
+    size_type charbytes = strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(*sptr++));
+    if (charbytes) --pos;
+    offset += charbytes;
+  }
+  return offset;
+}
+
+__device__ inline int string_view::compare(const string_view& in) const
+{
+  return compare(in.data(), in.size_bytes());
+}
+
+__device__ inline int string_view::compare(const char* data, size_type bytes) const
+{
+  size_type const len1      = size_bytes();
+  const unsigned char* ptr1 = reinterpret_cast<const unsigned char*>(this->data());
+  const unsigned char* ptr2 = reinterpret_cast<const unsigned char*>(data);
+  size_type idx             = 0;
+  for (; (idx < len1) && (idx < bytes); ++idx) {
+    if (*ptr1 != *ptr2) return static_cast<int>(*ptr1) - static_cast<int>(*ptr2);
+    ++ptr1;
+    ++ptr2;
+  }
+  if (idx < len1) return 1;
+  if (idx < bytes) return -1;
+  return 0;
+}
+
+__device__ inline bool string_view::operator==(const string_view& rhs) const
+{
+  return compare(rhs) == 0;
+}
+
+__device__ inline bool string_view::operator!=(const string_view& rhs) const
+{
+  return compare(rhs) != 0;
+}
+
+__device__ inline bool string_view::operator<(const string_view& rhs) const
+{
+  return compare(rhs) < 0;
+}
+
+__device__ inline bool string_view::operator>(const string_view& rhs) const
+{
+  return compare(rhs) > 0;
+}
+
+__device__ inline bool string_view::operator<=(const string_view& rhs) const
+{
+  int rc = compare(rhs);
+  return (rc == 0) || (rc < 0);
+}
+
+__device__ inline bool string_view::operator>=(const string_view& rhs) const
+{
+  int rc = compare(rhs);
+  return (rc == 0) || (rc > 0);
+}
+
+__device__ inline size_type string_view::find(const string_view& str,
+                                              size_type pos,
+                                              size_type count) const
+{
+  return find(str.data(), str.size_bytes(), pos, count);
+}
+
+__device__ inline size_type string_view::find(const char* str,
+                                              size_type bytes,
+                                              size_type pos,
+                                              size_type count) const
+{
+  const char* sptr = data();
+  if (!str || !bytes) return -1;
+  size_type nchars = length();
+  if (count < 0) count = nchars;
+  size_type end = pos + count;
+  if (end < 0 || end > nchars) end = nchars;
+  size_type spos = byte_offset(pos);
+  size_type epos = byte_offset(end);
+
+  size_type len2 = bytes;
+  size_type len1 = (epos - spos) - len2 + 1;
+
+  const char* ptr1 = sptr + spos;
+  const char* ptr2 = str;
+  for (size_type idx = 0; idx < len1; ++idx) {
+    bool match = true;
+    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
+    if (match) return character_offset(idx + spos);
+    ptr1++;
+  }
+  return -1;
+}
+
+__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
+{
+  char str[sizeof(char_utf8)];
+  size_type chwidth = strings::detail::from_char_utf8(chr, str);
+  return find(str, chwidth, pos, count);
+}
+
+__device__ inline size_type string_view::rfind(const string_view& str,
+                                               size_type pos,
+                                               size_type count) const
+{
+  return rfind(str.data(), str.size_bytes(), pos, count);
+}
+
+__device__ inline size_type string_view::rfind(const char* str,
+                                               size_type bytes,
+                                               size_type pos,
+                                               size_type count) const
+{
+  const char* sptr = data();
+  if (!str || !bytes) return -1;
+  size_type nchars = length();
+  size_type end    = pos + count;
+  if (end < 0 || end > nchars) end = nchars;
+  size_type spos = byte_offset(pos);
+  size_type epos = byte_offset(end);
+
+  size_type len2 = bytes;
+  size_type len1 = (epos - spos) - len2 + 1;
+
+  const char* ptr1 = sptr + epos - len2;
+  const char* ptr2 = str;
+  for (int idx = 0; idx < len1; ++idx) {
+    bool match = true;
+    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
+    if (match) return character_offset(epos - len2 - idx);
+    ptr1--;  // go backwards
+  }
+  return -1;
+}
+
+__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
+{
+  char str[sizeof(char_utf8)];
+  size_type chwidth = strings::detail::from_char_utf8(chr, str);
+  return rfind(str, chwidth, pos, count);
+}
+
+// parameters are character position values
+__device__ inline string_view string_view::substr(size_type pos, size_type length) const
+{
+  size_type spos = byte_offset(pos);
+  size_type epos = byte_offset(pos + length);
+  if (epos > size_bytes()) epos = size_bytes();
+  if (spos >= epos) return string_view("", 0);
+  return string_view(data() + spos, epos - spos);
+}
+
+__device__ inline size_type string_view::character_offset(size_type bytepos) const
+{
+  if (_char_width > 0) return bytepos / _char_width;
+  return strings::detail::characters_in_string(data(), bytepos);
+}
+
+}  // namespace cudf
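Editor's note: the refactored characters_in_string above counts the bytes that start a UTF-8 character (any byte whose top two bits are not the 10xxxxxx continuation pattern). The following is a minimal, standalone host-side C++ sketch of that counting rule; it is not part of the patch, and count_characters is a local name for illustration only.

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // A byte begins a UTF-8 character unless it matches 10xxxxxx (a continuation byte).
    constexpr bool is_begin_utf8_char(uint8_t byte) { return (byte & 0xC0) != 0x80; }

    // Count characters by counting lead bytes, mirroring the device-side logic above.
    int count_characters(const char* str, size_t bytes)
    {
      int chars = 0;
      for (size_t i = 0; i < bytes; ++i)
        chars += is_begin_utf8_char(static_cast<uint8_t>(str[i])) ? 1 : 0;
      return chars;
    }

    int main()
    {
      const char* s = "caf\xC3\xA9";  // "café": 5 bytes, 4 characters
      std::cout << count_characters(s, std::strlen(s)) << '\n';  // prints 4
    }
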
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
new file mode 100644
index 00000000000..9c42c216791
--- /dev/null
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+
+/**
+ * @file
+ * @brief Class definition for cudf::string_view.
+ */
+
+namespace cudf {
+
+using char_utf8 = uint32_t;  ///< UTF-8 characters are 1-4 bytes
+
+/**
+ * @brief The string length is initialized to this value as a place-holder.
+ *
+ * The number of characters in a string is computed on-demand.
+ */
+constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1};
+
+/**
+ * @brief The char width is initialized to this value as a place-holder.
+ *
+ * The byte-width of the characters in a string is computed on-demand.
+ */
+constexpr int8_t UNKNOWN_CHAR_WIDTH{-1};
+
+/**
+ * @brief This value is assigned to the _char_width member if the string
+ * contains characters of different widths.
+ */
+constexpr int8_t VARIABLE_CHAR_WIDTH{0};
+
+/**
+ * @brief A non-owning, immutable view of device data that is a variable length
+ * char array representing a UTF-8 string.
+ *
+ * @ingroup strings_classes
+ *
+ * The caller must maintain the device memory for the lifetime of this instance.
+ *
+ * This may be used to wrap a device pointer and size but any member function
+ * that requires accessing the device memory must be called from a kernel.
+ */
+class string_view {
+ public:
+  /**
+   * @brief Return the number of bytes in this string
+   */
+  CUDA_HOST_DEVICE_CALLABLE size_type size_bytes() const { return _bytes; }
+  /**
+   * @brief Return the number of characters in this string
+   */
+  CUDA_DEVICE_CALLABLE size_type length() const;
+  /**
+   * @brief Return a pointer to the internal device array
+   */
+  CUDA_HOST_DEVICE_CALLABLE const char* data() const { return _data; }
+
+  /**
+   * @brief Return true if string has no characters
+   */
+  CUDA_HOST_DEVICE_CALLABLE bool empty() const { return size_bytes() == 0; }
+
+  /**
+   * @brief Handy iterator for navigating through encoded characters.
+   */
+  class const_iterator {
+   public:
+    using difference_type   = ptrdiff_t;
+    using value_type        = char_utf8;
+    using reference         = char_utf8&;
+    using pointer           = char_utf8*;
+    using iterator_category = std::input_iterator_tag;
+    CUDA_DEVICE_CALLABLE const_iterator(const string_view& str, size_type pos);
+    const_iterator(const const_iterator& mit) = default;
+    const_iterator(const_iterator&& mit)      = default;
+    const_iterator& operator=(const const_iterator&) = default;
+    const_iterator& operator=(const_iterator&&) = default;
+    CUDA_DEVICE_CALLABLE const_iterator& operator++();
+    CUDA_DEVICE_CALLABLE const_iterator operator++(int);
+    CUDA_DEVICE_CALLABLE const_iterator& operator+=(difference_type);
+    CUDA_DEVICE_CALLABLE const_iterator operator+(difference_type);
+    CUDA_DEVICE_CALLABLE const_iterator& operator--();
+    CUDA_DEVICE_CALLABLE const_iterator operator--(int);
+    CUDA_DEVICE_CALLABLE const_iterator& operator-=(difference_type);
+    CUDA_DEVICE_CALLABLE const_iterator operator-(difference_type);
+    CUDA_DEVICE_CALLABLE bool operator==(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator!=(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator<(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator<=(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator>(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE bool operator>=(const const_iterator&) const;
+    CUDA_DEVICE_CALLABLE char_utf8 operator*() const;
+    CUDA_DEVICE_CALLABLE size_type position() const;
+    CUDA_DEVICE_CALLABLE size_type byte_offset() const;
+
+   private:
+    const char* p{};
+    size_type bytes{};
+    size_type char_pos{};
+    size_type byte_pos{};
+  };
+
+  /**
+   * @brief Return new iterator pointing to the beginning of this string
+   */
+  CUDA_DEVICE_CALLABLE const_iterator begin() const;
+  /**
+   * @brief Return new iterator pointing past the end of this string
+   */
+  CUDA_DEVICE_CALLABLE const_iterator end() const;
+
+  /**
+   * @brief Return single UTF-8 character at the given character position
+   *
+   * @param pos Character position
+   */
+  CUDA_DEVICE_CALLABLE char_utf8 operator[](size_type pos) const;
+  /**
+   * @brief Return the byte offset from data() for a given character position
+   *
+   * @param pos Character position
+   */
+  CUDA_DEVICE_CALLABLE size_type byte_offset(size_type pos) const;
+
+  /**
+   * @brief Compares the target string with this string. Each character is compared
+   * as a UTF-8 code-point value.
+   *
+   * @param str Target string to compare with this string.
+   * @return 0  If they compare equal.
+   *         <0 Either the value of the first character of this string that does
+   *            not match is lower in the arg string, or all compared characters
+   *            match but the arg string is shorter.
+   *         >0 Either the value of the first character of this string that does
+   *            not match is greater in the arg string, or all compared characters
+   *            match but the arg string is longer.
+   */
+  CUDA_DEVICE_CALLABLE int compare(const string_view& str) const;
+  /**
+   * @brief Compares the target string with this string. Each character is compared
+   * as a UTF-8 code-point value.
+   *
+   * @param str Target string to compare with this string.
+   * @param bytes Number of bytes in str.
+   * @return 0  If they compare equal.
+   *         <0 Either the value of the first character of this string that does
+   *            not match is lower in the arg string, or all compared characters
+   *            match but the arg string is shorter.
+   *         >0 Either the value of the first character of this string that does
+   *            not match is greater in the arg string, or all compared characters
+   *            match but the arg string is longer.
+   */
+  CUDA_DEVICE_CALLABLE int compare(const char* str, size_type bytes) const;
+
+  /**
+   * @brief Returns true if rhs matches this string exactly.
+   */
+  CUDA_DEVICE_CALLABLE bool operator==(const string_view& rhs) const;
+  /**
+   * @brief Returns true if rhs does not match this string.
+   */
+  CUDA_DEVICE_CALLABLE bool operator!=(const string_view& rhs) const;
+  /**
+   * @brief Returns true if this string is ordered before rhs.
+   */
+  CUDA_DEVICE_CALLABLE bool operator<(const string_view& rhs) const;
+  /**
+   * @brief Returns true if rhs is ordered before this string.
+   */
+  CUDA_DEVICE_CALLABLE bool operator>(const string_view& rhs) const;
+  /**
+   * @brief Returns true if this string matches or is ordered before rhs.
+   */
+  CUDA_DEVICE_CALLABLE bool operator<=(const string_view& rhs) const;
+  /**
+   * @brief Returns true if rhs matches or is ordered before this string.
+   */
+  CUDA_DEVICE_CALLABLE bool operator>=(const string_view& rhs) const;
+
+  /**
+   * @brief Returns the character position of the first occurrence where the
+   * argument str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target string to search within this string.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if str is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type find(const string_view& str,
+                                      size_type pos   = 0,
+                                      size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the first occurrence where the
+   * array str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target array to search within this string.
+   * @param bytes Number of bytes in str.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type find(const char* str,
+                                      size_type bytes,
+                                      size_type pos   = 0,
+                                      size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the first occurrence where
+   * character is found in this string within the character range [pos,pos+n).
+   *
+   * @param character Single encoded character.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type find(char_utf8 character,
+                                      size_type pos   = 0,
+                                      size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the last occurrence where the
+   * argument str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target string to search within this string.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type rfind(const string_view& str,
+                                       size_type pos   = 0,
+                                       size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the last occurrence where the
+   * array str is found in this string within the character range [pos,pos+n).
+   *
+   * @param str Target string to search with this string.
+   * @param bytes Number of bytes in str.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type rfind(const char* str,
+                                       size_type bytes,
+                                       size_type pos   = 0,
+                                       size_type count = -1) const;
+  /**
+   * @brief Returns the character position of the last occurrence where
+   * character is found in this string within the character range [pos,pos+n).
+   *
+   * @param character Single encoded character.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   */
+  CUDA_DEVICE_CALLABLE size_type rfind(char_utf8 character,
+                                       size_type pos   = 0,
+                                       size_type count = -1) const;
+
+  /**
+   * @brief Return a sub-string of this string. The original string and device
+   * memory must still be maintained for the lifetime of the returned instance.
+   *
+   * @param start Character position to start the sub-string.
+   * @param length Number of characters from start to include in the sub-string.
+   * @return New instance pointing to a subset of the characters within this instance.
+   */
+  CUDA_DEVICE_CALLABLE string_view substr(size_type start, size_type length) const;
+
+  /**
+   * @brief Default constructor represents an empty string.
+   */
+  CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0), _char_width(0) {}
+
+  /**
+   * @brief Create instance from existing device char array.
+   *
+   * @param data Device char array encoded in UTF8.
+   * @param bytes Number of bytes in data array.
+   */
+  CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes)
+    : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH)
+  {
+  }
+
+  string_view(const string_view&) = default;
+  string_view(string_view&&)      = default;
+  ~string_view()                  = default;
+  string_view& operator=(const string_view&) = default;
+  string_view& operator=(string_view&&) = default;
+
+ private:
+  const char* _data{};           ///< Pointer to device memory containing the char array for this string
+  size_type _bytes{};            ///< Number of bytes in _data for this string
+  mutable size_type _length{};   ///< Number of characters in this string (computed)
+  mutable int8_t _char_width{};  ///< Number of bytes per character if uniform width (computed)
+
+  /**
+   * @brief Return the character position of the given byte offset.
+   *
+   * @param bytepos Byte position from start of _data.
+   * @return The character position for the specified byte.
+   */
+  CUDA_DEVICE_CALLABLE size_type character_offset(size_type bytepos) const;
+};
+
+namespace strings {
+namespace detail {
+
+/**
+ * @brief This will return true if passed the first byte of a UTF-8 character.
+ *
+ * @param byte Any byte from a valid UTF-8 character
+ * @return true if this is the first byte of the character
+ */
+constexpr bool is_begin_utf8_char(uint8_t byte)
+{
+  // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
+  return (byte & 0xC0) != 0x80;
+}
+
+/**
+ * @brief Returns the number of bytes in the specified character.
+ *
+ * @param character Single character
+ * @return Number of bytes
+ */
+constexpr size_type bytes_in_char_utf8(char_utf8 character)
+{
+  return 1 + static_cast<size_type>((character & unsigned{0x0000FF00}) > 0) +
+         static_cast<size_type>((character & unsigned{0x00FF0000}) > 0) +
+         static_cast<size_type>((character & unsigned{0xFF000000}) > 0);
+}
+
+/**
+ * @brief Returns the number of bytes used to represent the provided byte.
+ *
+ * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
+ * single character. For example, for the two-byte 0xC3A8 single character,
+ * the first byte would return 2 and the second byte would return 0.
+ *
+ * @param byte Byte from an encoded character.
+ * @return Number of bytes.
+ */
+constexpr size_type bytes_in_utf8_byte(uint8_t byte)
+{
+  return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
+         + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
+         + static_cast<size_type>((byte & 0xC0) == 0xC0)    // 2-byte character prefix
+         - static_cast<size_type>((byte & 0xC0) == 0x80);   // intermediate byte
+}
+
+/**
+ * @brief Convert a char array into a char_utf8 value.
+ *
+ * @param str String containing encoded char bytes.
+ * @param[out] character Single char_utf8 value.
+ * @return The number of bytes in the character
+ */
+CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& character)
+{
+  size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
+
+  character = static_cast<char_utf8>(*str++) & 0xFF;
+  if (chr_width > 1) {
+    character = character << 8;
+    character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 8;
+    if (chr_width > 2) {
+      character = character << 8;
+      character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 16;
+      if (chr_width > 3) {
+        character = character << 8;
+        character |= (static_cast<char_utf8>(*str++) & 0xFF);  // << 24;
+      }
+    }
+  }
+  return chr_width;
+}
+
+/**
+ * @brief Place a char_utf8 value into a char array.
+ *
+ * @param character Single character
+ * @param[out] str Allocated char array with enough space to hold the encoded character.
+ * @return The number of bytes in the character
+ */
+CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char* str)
+{
+  size_type const chr_width = bytes_in_char_utf8(character);
+  for (size_type idx = 0; idx < chr_width; ++idx) {
+    str[chr_width - idx - 1] = static_cast<char>(character) & 0xFF;
+    character                = character >> 8;
+  }
+  return chr_width;
+}
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
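Editor's note: the new header packs the raw UTF-8 bytes of one character into a char_utf8 (uint32_t), most significant byte first. Below is a minimal standalone C++ sketch of that round-trip, mirroring the bytes_in_char_utf8/from_char_utf8 logic above; the names bytes_in_char and unpack are local to the example, not cudf API.

    #include <cstdint>
    #include <iostream>

    using char_utf8 = uint32_t;

    // Width of an encoded character: one byte per occupied byte of the packed value.
    constexpr int bytes_in_char(char_utf8 chr)
    {
      return 1 + ((chr & 0x0000FF00u) > 0) + ((chr & 0x00FF0000u) > 0) + ((chr & 0xFF000000u) > 0);
    }

    // Write the packed bytes back out, most significant byte first.
    int unpack(char_utf8 chr, char* out)
    {
      int const width = bytes_in_char(chr);
      for (int i = 0; i < width; ++i) {
        out[width - i - 1] = static_cast<char>(chr & 0xFF);
        chr >>= 8;
      }
      return width;
    }

    int main()
    {
      char buf[4];
      int n = unpack(0xC3A9u, buf);  // 'é' encoded as the two bytes 0xC3 0xA9
      std::cout << n << '\n';        // prints 2
    }
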
diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl
deleted file mode 100644
index eee59604171..00000000000
--- a/cpp/include/cudf/strings/string_view.inl
+++ /dev/null
@@ -1,463 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-
-namespace {
-using BYTE = uint8_t;
-
-// number of characters in a string computed on-demand
-// the _length member is initialized to this value as a place-holder
-constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1};
-// the byte-width of the characters in a string is computed on-demand
-// the _char_width member is initialized to this value as a place-holder
-constexpr int8_t UNKNOWN_CHAR_WIDTH{-1};
-// this value is assigned to the _char_width member if the string
-// contains characters of different widths
-constexpr int8_t VARIABLE_CHAR_WIDTH{0};
-
-/**
- * @brief Returns the number of bytes used to represent the provided byte.
- * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a
- * single character. For example, for the two-byte 0xC3A8 single character,
- * the first byte would return 2 and the second byte would return 0.
- *
- * @param byte Byte from an encoded character.
- * @return Number of bytes.
- */
-__host__ __device__ inline cudf::size_type bytes_in_utf8_byte(BYTE byte)
-{
-  cudf::size_type count = 1;
-  count += (int)((byte & 0xF0) == 0xF0);  // 4-byte character prefix
-  count += (int)((byte & 0xE0) == 0xE0);  // 3-byte character prefix
-  count += (int)((byte & 0xC0) == 0xC0);  // 2-byte character prefix
-  count -= (int)((byte & 0xC0) == 0x80);  // intermediate byte
-  return count;
-}
-
-/**
- * @brief Returns the number of bytes used in the provided char array by
- * searching for a null-terminator byte.
- *
- * @param str Null-terminated array of chars.
- * @return Number of bytes.
- */
-__device__ inline cudf::size_type string_bytes(const char* str)
-{
-  if (!str) return 0;
-  cudf::size_type bytes = 0;
-  while (*str++) ++bytes;
-  return bytes;
-}
-
-}  // namespace
-
-namespace cudf {
-
-__host__ __device__ inline string_view::string_view()
-  : _data(""), _bytes(0), _length(0), _char_width(0)
-{
-}
-
-__host__ __device__ inline string_view::string_view(const char* data, size_type bytes)
-  : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH)
-{
-}
-
-//
-__host__ __device__ inline size_type string_view::size_bytes() const { return _bytes; }
-
-__device__ inline size_type string_view::length() const
-{
-  if (_length == UNKNOWN_STRING_LENGTH)
-    _length = strings::detail::characters_in_string(_data, _bytes);
-  if (_length && (_char_width == UNKNOWN_CHAR_WIDTH)) {
-    const BYTE* bytes = reinterpret_cast<const BYTE*>(data());
-    auto chwidth      = bytes_in_utf8_byte(*bytes);  // see if they are all the same width
-    _char_width       = (thrust::find_if(thrust::seq,
-                                   bytes,
-                                   bytes + size_bytes(),
-                                   [chwidth](auto ch) {
-                                     auto width = bytes_in_utf8_byte(ch);
-                                     return (width != 0) && (width != chwidth);
-                                   })) == (bytes + size_bytes())
-                    ? chwidth
-                    : VARIABLE_CHAR_WIDTH;
-  }
-  return _length;
-}
-
-__host__ __device__ inline const char* string_view::data() const { return _data; }
-
-__host__ __device__ inline bool string_view::empty() const { return _bytes == 0; }
-
-// this custom iterator knows about UTF8 encoding
-__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos)
-  : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)}
-{
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator++()
-{
-  if (byte_pos < bytes) byte_pos += bytes_in_utf8_byte((BYTE)p[byte_pos]);
-  ++char_pos;
-  return *this;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int)
-{
-  string_view::const_iterator tmp(*this);
-  operator++();
-  return tmp;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator+(
-  string_view::const_iterator::difference_type offset)
-{
-  const_iterator tmp(*this);
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? ++tmp : --tmp;
-  return tmp;
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator+=(
-  string_view::const_iterator::difference_type offset)
-{
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? operator++() : operator--();
-  return *this;
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
-{
-  if (byte_pos > 0)
-    while (bytes_in_utf8_byte((BYTE)p[--byte_pos]) == 0)
-      ;
-  --char_pos;
-  return *this;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator--(int)
-{
-  string_view::const_iterator tmp(*this);
-  operator--();
-  return tmp;
-}
-
-__device__ inline string_view::const_iterator& string_view::const_iterator::operator-=(
-  string_view::const_iterator::difference_type offset)
-{
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? operator--() : operator++();
-  return *this;
-}
-
-__device__ inline string_view::const_iterator string_view::const_iterator::operator-(
-  string_view::const_iterator::difference_type offset)
-{
-  const_iterator tmp(*this);
-  size_type adjust = abs(offset);
-  while (adjust-- > 0) offset > 0 ? --tmp : ++tmp;
-  return tmp;
-}
-
-__device__ inline bool string_view::const_iterator::operator==(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos == rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator!=(
-  const string_view::const_iterator& rhs) const
-{
-  return (p != rhs.p) || (char_pos != rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator<(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos < rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator<=(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos <= rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator>(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos > rhs.char_pos);
-}
-
-__device__ inline bool string_view::const_iterator::operator>=(
-  const string_view::const_iterator& rhs) const
-{
-  return (p == rhs.p) && (char_pos >= rhs.char_pos);
-}
-
-__device__ inline char_utf8 string_view::const_iterator::operator*() const
-{
-  char_utf8 chr = 0;
-  strings::detail::to_char_utf8(p + byte_offset(), chr);
-  return chr;
-}
-
-__device__ inline size_type string_view::const_iterator::position() const { return char_pos; }
-
-__device__ inline size_type string_view::const_iterator::byte_offset() const { return byte_pos; }
-
-__device__ inline string_view::const_iterator string_view::begin() const
-{
-  return const_iterator(*this, 0);
-}
-
-__device__ inline string_view::const_iterator string_view::end() const
-{
-  return const_iterator(*this, length());
-}
-
-__device__ inline char_utf8 string_view::operator[](size_type pos) const
-{
-  size_type offset = byte_offset(pos);
-  if (offset >= _bytes) return 0;
-  char_utf8 chr = 0;
-  strings::detail::to_char_utf8(data() + offset, chr);
-  return chr;
-}
-
-__device__ inline size_type string_view::byte_offset(size_type pos) const
-{
-  size_type offset = 0;
-  const char* sptr = _data;
-  const char* eptr = sptr + _bytes;
-  if (_char_width > 0) return pos * _char_width;
-  while ((pos > 0) && (sptr < eptr)) {
-    size_type charbytes = bytes_in_utf8_byte((BYTE)*sptr++);
-    if (charbytes) --pos;
-    offset += charbytes;
-  }
-  return offset;
-}
-
-__device__ inline int string_view::compare(const string_view& in) const
-{
-  return compare(in.data(), in.size_bytes());
-}
-
-__device__ inline int string_view::compare(const char* data, size_type bytes) const
-{
-  size_type const len1      = size_bytes();
-  const unsigned char* ptr1 = reinterpret_cast<const unsigned char*>(this->data());
-  const unsigned char* ptr2 = reinterpret_cast<const unsigned char*>(data);
-  size_type idx             = 0;
-  for (; (idx < len1) && (idx < bytes); ++idx) {
-    if (*ptr1 != *ptr2) return static_cast<int>(*ptr1) - static_cast<int>(*ptr2);
-    ++ptr1;
-    ++ptr2;
-  }
-  if (idx < len1) return 1;
-  if (idx < bytes) return -1;
-  return 0;
-}
-
-__device__ inline bool string_view::operator==(const string_view& rhs) const
-{
-  return compare(rhs) == 0;
-}
-
-__device__ inline bool string_view::operator!=(const string_view& rhs) const
-{
-  return compare(rhs) != 0;
-}
-
-__device__ inline bool string_view::operator<(const string_view& rhs) const
-{
-  return compare(rhs) < 0;
-}
-
-__device__ inline bool string_view::operator>(const string_view& rhs) const
-{
-  return compare(rhs) > 0;
-}
-
-__device__ inline bool string_view::operator<=(const string_view& rhs) const
-{
-  int rc = compare(rhs);
-  return (rc == 0) || (rc < 0);
-}
-
-__device__ inline bool string_view::operator>=(const string_view& rhs) const
-{
-  int rc = compare(rhs);
-  return (rc == 0) || (rc > 0);
-}
-
-__device__ inline size_type string_view::find(const string_view& str,
-                                              size_type pos,
-                                              size_type count) const
-{
-  return find(str.data(), str.size_bytes(), pos, count);
-}
-
-__device__ inline size_type string_view::find(const char* str,
-                                              size_type bytes,
-                                              size_type pos,
-                                              size_type count) const
-{
-  const char* sptr = data();
-  if (!str || !bytes) return -1;
-  size_type nchars = length();
-  if (count < 0) count = nchars;
-  size_type end = pos + count;
-  if (end < 0 || end > nchars) end = nchars;
-  size_type spos = byte_offset(pos);
-  size_type epos = byte_offset(end);
-
-  size_type len2 = bytes;
-  size_type len1 = (epos - spos) - len2 + 1;
-
-  const char* ptr1 = sptr + spos;
-  const char* ptr2 = str;
-  for (size_type idx = 0; idx < len1; ++idx) {
-    bool match = true;
-    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
-    if (match) return character_offset(idx + spos);
-    ptr1++;
-  }
-  return -1;
-}
-
-__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, size_type count) const
-{
-  char str[sizeof(char_utf8)];
-  size_type chwidth = strings::detail::from_char_utf8(chr, str);
-  return find(str, chwidth, pos, count);
-}
-
-__device__ inline size_type string_view::rfind(const string_view& str,
-                                               size_type pos,
-                                               size_type count) const
-{
-  return rfind(str.data(), str.size_bytes(), pos, count);
-}
-
-__device__ inline size_type string_view::rfind(const char* str,
-                                               size_type bytes,
-                                               size_type pos,
-                                               size_type count) const
-{
-  const char* sptr = data();
-  if (!str || !bytes) return -1;
-  size_type nchars = length();
-  size_type end    = pos + count;
-  if (end < 0 || end > nchars) end = nchars;
-  size_type spos = byte_offset(pos);
-  size_type epos = byte_offset(end);
-
-  size_type len2 = bytes;
-  size_type len1 = (epos - spos) - len2 + 1;
-
-  const char* ptr1 = sptr + epos - len2;
-  const char* ptr2 = str;
-  for (int idx = 0; idx < len1; ++idx) {
-    bool match = true;
-    for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]);
-    if (match) return character_offset(epos - len2 - idx);
-    ptr1--;  // go backwards
-  }
-  return -1;
-}
-
-__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, size_type count) const
-{
-  char str[sizeof(char_utf8)];
-  size_type chwidth = strings::detail::from_char_utf8(chr, str);
-  return rfind(str, chwidth, pos, count);
-}
-
-// parameters are character position values
-__device__ inline string_view string_view::substr(size_type pos, size_type length) const
-{
-  size_type spos = byte_offset(pos);
-  size_type epos = byte_offset(pos + length);
-  if (epos > size_bytes()) epos = size_bytes();
-  if (spos >= epos) return string_view("", 0);
-  return string_view(data() + spos, epos - spos);
-}
-
-__device__ inline size_type string_view::character_offset(size_type bytepos) const
-{
-  if (_char_width > 0) return bytepos / _char_width;
-  return strings::detail::characters_in_string(data(), bytepos);
-}
-
-namespace strings {
-namespace detail {
-__host__ __device__ inline size_type bytes_in_char_utf8(char_utf8 chr)
-{
-  size_type count = 1;
-  count += (int)((chr & (unsigned)0x0000FF00) > 0);
-  count += (int)((chr & (unsigned)0x00FF0000) > 0);
-  count += (int)((chr & (unsigned)0xFF000000) > 0);
-  return count;
-}
-
-__host__ __device__ inline size_type to_char_utf8(const char* pSrc, char_utf8& chr)
-{
-  size_type chwidth = bytes_in_utf8_byte((BYTE)*pSrc);
-  chr               = (char_utf8)(*pSrc++) & 0xFF;
-  if (chwidth > 1) {
-    chr = chr << 8;
-    chr |= ((char_utf8)(*pSrc++) & 0xFF);  // << 8;
-    if (chwidth > 2) {
-      chr = chr << 8;
-      chr |= ((char_utf8)(*pSrc++) & 0xFF);  // << 16;
-      if (chwidth > 3) {
-        chr = chr << 8;
-        chr |= ((char_utf8)(*pSrc++) & 0xFF);  // << 24;
-      }
-    }
-  }
-  return chwidth;
-}
-
-__host__ __device__ inline size_type from_char_utf8(char_utf8 chr, char* dst)
-{
-  size_type chwidth = bytes_in_char_utf8(chr);
-  for (size_type idx = 0; idx < chwidth; ++idx) {
-    dst[chwidth - idx - 1] = (char)chr & 0xFF;
-    chr                    = chr >> 8;
-  }
-  return chwidth;
-}
-
-// counts the number of characters in the given char array
-__host__ __device__ inline size_type characters_in_string(const char* str, size_type bytes)
-{
-  if ((str == 0) || (bytes == 0)) return 0;
-  //
-  unsigned int nchars = 0;
-  for (size_type idx = 0; idx < bytes; ++idx)
-    nchars += (unsigned int)(((BYTE)str[idx] & 0xC0) != 0x80);
-  return (size_type)nchars;
-}
-
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
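Editor's note: the byte_offset fast path that the refactor keeps is worth calling out: when length() determines that every character has the same encoded width, the byte offset of a character position is a single multiply; otherwise the bytes are scanned. Below is a minimal standalone C++ sketch of that idea under the assumption of well-formed UTF-8 starting at a lead byte; names are local to the example, not the library API.

    #include <cstddef>
    #include <cstdint>

    // char_width > 0 means "fixed width": offset is pos * width.
    // char_width == 0 means variable width: walk the bytes character by character.
    size_t byte_offset(const char* data, size_t bytes, int char_width, size_t pos)
    {
      if (char_width > 0) return pos * static_cast<size_t>(char_width);
      size_t offset = 0;
      while (pos > 0 && offset < bytes) {
        uint8_t const b = static_cast<uint8_t>(data[offset]);
        // width of the character starting at this lead byte (1-4 bytes)
        int const w = ((b & 0xF0) == 0xF0) ? 4 : ((b & 0xE0) == 0xE0) ? 3 : ((b & 0xC0) == 0xC0) ? 2 : 1;
        offset += static_cast<size_t>(w);
        --pos;
      }
      return offset;
    }
    // e.g. byte_offset("caf\xC3\xA9", 5, 0, 4) == 5, while an all-ASCII string
    // with char_width 1 resolves in constant time.
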
diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp
index 9588214488c..e014f88c451 100644
--- a/cpp/include/cudf/strings/translate.hpp
+++ b/cpp/include/cudf/strings/translate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include
 #include
+#include <cudf/strings/string_view.hpp>
 #include
 #include
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 69035a36c58..0e89058050d 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #pragma once
 
 #include
-#include <cudf/strings/string_view.hpp>
 #include
 #include
 #include
@@ -505,6 +504,8 @@ constexpr inline bool is_fixed_width(data_type type)
   return cudf::type_dispatcher(type, is_fixed_width_impl{});
 }
 
+class string_view;
+
 /**
 * @brief Indicates whether the type `T` is a compound type.
 *
diff --git a/cpp/include/cudf_test/scalar_utilities.hpp b/cpp/include/cudf_test/scalar_utilities.hpp
deleted file mode 100644
index 7e34630365e..00000000000
--- a/cpp/include/cudf_test/scalar_utilities.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/scalar/scalar.hpp>
-
-namespace cudf {
-namespace test {
-/**
- * @brief Verifies the equality of two scalars.
- *
- * Treats invalid scalars as equivalent.
- *
- * @param lhs The first scalar
- * @param rhs The second scalar
- */
-void expect_scalars_equal(cudf::scalar const& lhs, cudf::scalar const& rhs);
-
-}  // namespace test
-}  // namespace cudf
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp
index 1d7174e05d7..71c2b74b37b 100644
--- a/cpp/include/cudf_test/type_lists.hpp
+++ b/cpp/include/cudf_test/type_lists.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #pragma once
 
 #include
+#include <cudf/strings/string_view.hpp>
 #include
 #include
 #include
diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h
index 03e00b881d8..e732a13e67c 100644
--- a/cpp/include/doxygen_groups.h
+++ b/cpp/include/doxygen_groups.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -143,6 +143,9 @@
 * @defgroup lists_apis Lists
 * @{
 *   @defgroup lists_extract Extracting
+ *   @defgroup lists_contains Searching
+ *   @defgroup lists_gather Gathering
+ *   @defgroup lists_elements Counting
 * @}
 * @defgroup nvtext_apis NVText
 * @{
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 5f4fcb1c108..46d070e14af 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -34,6 +34,12 @@ struct get_column_data_impl {
   }
 };
 
+template <>
+void const* get_column_data_impl::operator()<string_view>(column_view const& col)
+{
+  return nullptr;
+}
+
 void const* get_column_data(column_view const& col)
 {
   return type_dispatcher(col.type(), get_column_data_impl{}, col);
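Editor's note: the dlpack change above opts strings out of DLPack export by fully specializing the dispatch functor's call operator, since a string column has no fixed-width device buffer DLPack could describe. Below is a minimal standalone C++ sketch of that specialization pattern; the types get_data and string_view here are stand-ins for illustration, not the cudf API.

    #include <iostream>

    struct string_view {};  // stand-in for cudf::string_view

    // Generic dispatch functor: returns the data pointer for ordinary types.
    struct get_data {
      template <typename T>
      const void* operator()(const void* base) { return base; }
    };

    // Full specialization opting strings out: no representable buffer, so nullptr.
    template <>
    const void* get_data::operator()<string_view>(const void*) { return nullptr; }

    int main()
    {
      int x = 42;
      get_data f;
      std::cout << (f.operator()<int>(&x) != nullptr) << '\n';          // prints 1
      std::cout << (f.operator()<string_view>(&x) != nullptr) << '\n';  // prints 0
    }
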
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 69d894f9b49..ef1c17aa817 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -52,37 +52,6 @@ namespace gpu {
 /// Block dimension for dtype detection and conversion kernels
 constexpr uint32_t csvparse_block_dim = 128;
 
-/*
- * @brief Checks whether the given character is a whitespace character.
- *
- * @param ch The character to check
- *
- * @return True if the input is whitespace, False otherwise
- */
-__device__ __inline__ bool is_whitespace(char c) { return c == '\t' || c == ' '; }
-
-// TODO: replace with `trim_whitespaces_quotes` once `end` semantics is fixed
-/*
- * @brief Scans a character stream within a range, and adjusts the start and end
- * indices of the range to ignore whitespace and quotation characters.
- *
- * @param data The character stream to scan
- * @param start The start index to adjust
- * @param end The end index to adjust
- * @param quotechar The character used to denote quotes
- *
- * @return Adjusted or unchanged start_idx and end_idx
- */
-__device__ __inline__ void trim_field_start_end(const char **start,
-                                                const char **end,
-                                                char quotechar = '\0')
-{
-  while ((*start < *end) && is_whitespace(**start)) { (*start)++; }
-  if ((*start < *end) && **start == quotechar) { (*start)++; }
-  while ((*start <= *end) && is_whitespace(**end)) { (*end)--; }
-  if ((*start <= *end) && **end == quotechar) { (*end)--; }
-}
-
 /*
 * @brief Returns true is the input character is a valid digit.
 * Supports both decimal and hexadecimal digits (uppercase and lowercase).
@@ -217,19 +186,16 @@ __global__ void __launch_bounds__(csvparse_block_dim)
   while (col < column_flags.size() && field_start <= row_end) {
     auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts);
 
-    // Checking if this is a column that the user wants --- user can filter
-    // columns
+    // Checking if this is a column that the user wants --- user can filter columns
     if (column_flags[col] & column_parse::enabled) {
-      // points to last character in the field
-      auto field_end = next_delimiter - 1;
-      long field_len = next_delimiter - field_start;
-
-      if (serialized_trie_contains(opts.trie_na, field_start, field_len)) {
+      auto const field_len = static_cast<size_t>(next_delimiter - field_start);
+      if (serialized_trie_contains(opts.trie_na, {field_start, field_len})) {
         atomicAdd(&d_columnData[actual_col].null_count, 1);
-      } else if (serialized_trie_contains(opts.trie_true, field_start, field_len) ||
-                 serialized_trie_contains(opts.trie_false, field_start, field_len)) {
+      } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) ||
+                 serialized_trie_contains(opts.trie_false, {field_start, field_len})) {
         atomicAdd(&d_columnData[actual_col].bool_count, 1);
-      } else if (cudf::io::gpu::is_infinity(field_start, field_end)) {
+      } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) {
         atomicAdd(&d_columnData[actual_col].float_count, 1);
       } else {
         long countNumber = 0;
@@ -243,10 +209,10 @@
 
         // Modify field_start & end to ignore whitespace and quotechars
         // This could possibly result in additional empty fields
-        trim_field_start_end(&field_start, &field_end);
-        field_len = field_end - field_start + 1;
+        auto const trimmed_field_range = trim_whitespaces_quotes(field_start, next_delimiter);
+        auto const trimmed_field_len   = trimmed_field_range.second - trimmed_field_range.first;
 
-        for (auto cur = field_start; cur <= field_end; cur++) {
+        for (auto cur = trimmed_field_range.first; cur < trimmed_field_range.second; ++cur) {
           if (is_digit(*cur)) {
             countNumber++;
             continue;
@@ -260,16 +226,18 @@
             case ':': countColon++; break;
             case 'e':
            case 'E':
-              if (cur > field_start && cur < field_end) countExponent++;
+              if (cur > trimmed_field_range.first && cur < trimmed_field_range.second - 1)
+                countExponent++;
              break;
            default: countString++; break;
          }
        }
        // Integers have to have the length of the string
-        long int_req_number_cnt = field_len;  // Off by one if they start with a minus sign
-        if ((*field_start == '-' || *field_start == '+') && field_len > 1) { --int_req_number_cnt; }
+        auto const int_req_number_cnt = trimmed_field_len - ((*trimmed_field_range.first == '-' ||
+                                                              *trimmed_field_range.first == '+') &&
+                                                             trimmed_field_len > 1);
 
        if (column_flags[col] & column_parse::as_datetime) {
          // PANDAS uses `object` dtype if the date is unparseable
@@ -279,13 +247,17 @@
            atomicAdd(&d_columnData[actual_col].string_count, 1);
          }
        } else if (countNumber == int_req_number_cnt) {
-          bool is_negative       = (*field_start == '-');
-          char const *data_begin = field_start + (is_negative || (*field_start == '+'));
-          cudf::size_type *ptr   = cudf::io::gpu::infer_integral_field_counter(
+          auto const is_negative = (*trimmed_field_range.first == '-');
+          auto const data_begin =
+            trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+'));
+          cudf::size_type *ptr = cudf::io::gpu::infer_integral_field_counter(
            data_begin, data_begin + countNumber, is_negative, d_columnData[actual_col]);
          atomicAdd(ptr, 1);
-        } else if (is_floatingpoint(
-                     field_len, countNumber, countDecimal, countDash + countPlus, countExponent)) {
+        } else if (is_floatingpoint(trimmed_field_len,
+                                    countNumber,
+                                    countDecimal,
+                                    countDash + countPlus,
+                                    countExponent)) {
          atomicAdd(&d_columnData[actual_col].float_count, 1);
        } else {
          atomicAdd(&d_columnData[actual_col].string_count, 1);
@@ -470,21 +442,13 @@ struct decode_op {
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = [&]() {
-      // Check for user-specified true/false values first, where the output is
-      // replaced with 1/0 respectively
-      const size_t field_len = end - begin + 1;
-      if (serialized_trie_contains(opts.trie_true, begin, field_len)) {
-        return static_cast<T>(1);
-      } else if (serialized_trie_contains(opts.trie_false, begin, field_len)) {
-        return static_cast<T>(0);
-      } else {
-        if (flags & column_parse::as_hexadecimal) {
-          return decode_value<T, 16>(begin, end, opts);
-        } else {
-          return decode_value<T>(begin, end, opts);
-        }
-      }
+    static_cast<T *>(out_buffer)[row] = [&flags, &opts, begin, end]() -> T {
+      // Check for user-specified true/false values
+      auto const field_len = static_cast<size_t>(end - begin);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; }
+      return flags & column_parse::as_hexadecimal ? decode_value<T, 16>(begin, end, opts)
+                                                  : decode_value<T>(begin, end, opts);
     }();
 
     return true;
@@ -501,18 +465,14 @@
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    auto &value{static_cast<T *>(out_buffer)[row]};
-
-    // Check for user-specified true/false values first, where the output is
-    // replaced with 1/0 respectively
-    const size_t field_len = end - begin + 1;
-    if (serialized_trie_contains(opts.trie_true, begin, field_len)) {
-      value = 1;
-    } else if (serialized_trie_contains(opts.trie_false, begin, field_len)) {
-      value = 0;
-    } else {
-      value = decode_value<T>(begin, end, opts);
-    }
+    static_cast<T *>(out_buffer)[row] = [&opts, begin, end]() {
+      // Check for user-specified true/false values
+      auto const field_len = static_cast<size_t>(end - begin);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
+      return decode_value<T>(begin, end, opts);
+    }();
+
     return true;
   }
 
@@ -528,9 +488,9 @@
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    auto &value{static_cast<T *>(out_buffer)[row]};
+    T const value                     = decode_value<T>(begin, end, opts);
+    static_cast<T *>(out_buffer)[row] = value;
 
-    value = decode_value<T>(begin, end, opts);
     return !std::isnan(value);
   }
 
@@ -547,9 +507,8 @@
                            parse_options_view const &opts,
                            column_parse::flags flags)
   {
-    auto &value{static_cast<T *>(out_buffer)[row]};
+    static_cast<T *>(out_buffer)[row] = decode_value<T>(begin, end, opts);
 
-    value = decode_value<T>(begin, end, opts);
     return true;
   }
 };
@@ -601,13 +560,16 @@ __global__ void __launch_bounds__(csvparse_block_dim)
 
     if (column_flags[col] & column_parse::enabled) {
       // check if the entire field is a NaN string - consistent with pandas
-      auto const is_valid =
-        !serialized_trie_contains(options.trie_na, field_start, next_delimiter - field_start);
+      auto const is_valid = !serialized_trie_contains(
+        options.trie_na, {field_start, static_cast<size_t>(next_delimiter - field_start)});
 
       // Modify field_start & end to ignore whitespace and quotechars
-      auto field_end = next_delimiter - 1;
+      auto field_end = next_delimiter;
       if (is_valid && dtypes[actual_col].id() != cudf::type_id::STRING) {
-        trim_field_start_end(&field_start, &field_end, options.quotechar);
+        auto const trimmed_field =
+          trim_whitespaces_quotes(field_start, field_end, options.quotechar);
+        field_start = trimmed_field.first;
+        field_end   = trimmed_field.second;
       }
      if (is_valid) {
        // Type dispatcher does not handle STRING
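Editor's note: a recurring theme in the csv_gpu.cu and datetime.cuh changes above is the move from inclusive [begin, end] field ranges to the conventional half-open [begin, end), where end points one past the last character. Below is a minimal standalone C++ sketch of a digit parser written in that half-open style, mirroring the to_non_negative_integer logic in the next file; it is an illustration, not the library function.

    #include <iostream>

    // Half-open range: `end` is one past the last character, so the loop
    // condition is `begin < end` rather than `begin <= end`.
    template <typename T>
    T to_non_negative_integer(const char* begin, const char* end)
    {
      T value = 0;
      for (; begin < end; ++begin) {
        if (*begin >= '0' && *begin <= '9') { value = value * 10 + (*begin - '0'); }
      }
      return value;
    }

    int main()
    {
      const char* s = "2021";
      std::cout << to_non_negative_integer<int>(s, s + 4) << '\n';  // prints 2021
    }

One benefit of the half-open convention is that empty ranges (begin == end) fall out naturally, removing the off-by-one adjustments like `end = end - 1` and `field_len = end - begin + 1` that the old code needed.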
* * @param begin Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed and converted value */ template @@ -35,7 +39,7 @@ __inline__ __device__ T to_non_negative_integer(char const* begin, char const* e { T value = 0; - for (; begin <= end; ++begin) { + for (; begin < end; ++begin) { if (*begin >= '0' && *begin <= '9') { value *= 10; value += *begin - '0'; @@ -130,7 +134,7 @@ __inline__ __device__ constexpr int64_t seconds_since_epoch( * @brief Extracts the Day, Month, and Year from a string. * * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string + * @param[in] end Pointer to the first element after the string * @param[in] dayfirst Flag indicating that first field is the day * @param[out] year * @param[out] month @@ -153,7 +157,7 @@ __inline__ __device__ bool extract_date( //--- is year the first filed? if ((sep_pos - begin) == 4) { - *year = to_non_negative_integer(begin, (sep_pos - 1)); + *year = to_non_negative_integer(begin, sep_pos); // Month auto s2 = sep_pos + 1; @@ -165,23 +169,23 @@ __inline__ __device__ bool extract_date( *day = 1; } else { - *month = to_non_negative_integer(s2, (sep_pos - 1)); + *month = to_non_negative_integer(s2, sep_pos); *day = to_non_negative_integer((sep_pos + 1), end); } } else { //--- if the dayfirst flag is set, then restricts the format options if (dayfirst) { - *day = to_non_negative_integer(begin, (sep_pos - 1)); + *day = to_non_negative_integer(begin, sep_pos); auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); - *month = to_non_negative_integer(s2, (sep_pos - 1)); + *month = to_non_negative_integer(s2, sep_pos); *year = to_non_negative_integer((sep_pos + 1), end); } else { - *month = to_non_negative_integer(begin, (sep_pos - 1)); + *month = to_non_negative_integer(begin, sep_pos); auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); @@ -192,7 +196,7 @@ __inline__ __device__ bool extract_date( *day = 1; } else { - *day = to_non_negative_integer(s2, (sep_pos - 1)); + *day = to_non_negative_integer(s2, sep_pos); *year = to_non_negative_integer((sep_pos + 1), end); } } @@ -211,7 +215,7 @@ __inline__ __device__ bool extract_date( * at the end. 
* * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string + * @param[in] end Pointer to the first element after the string * @param[out] hour The hour value * @param[out] minute The minute value * @param[out] second The second value (0 if not present) @@ -224,15 +228,17 @@ __inline__ __device__ void extract_time( // Adjust for AM/PM and any whitespace before int hour_adjust = 0; - if (*end == 'M' || *end == 'm') { - if (*(end - 1) == 'P' || *(end - 1) == 'p') { hour_adjust = 12; } - end = end - 2; - while (*end == ' ') { --end; } + auto last = end - 1; + if (*last == 'M' || *last == 'm') { + if (*(last - 1) == 'P' || *(last - 1) == 'p') { hour_adjust = 12; } + last = last - 2; + while (*last == ' ') { --last; } } + end = last + 1; // Find hour-minute separator const auto hm_sep = thrust::find(thrust::seq, begin, end, sep); - *hour = to_non_negative_integer(begin, hm_sep - 1) + hour_adjust; + *hour = to_non_negative_integer(begin, hm_sep) + hour_adjust; // Find minute-second separator (if present) const auto ms_sep = thrust::find(thrust::seq, hm_sep + 1, end, sep); @@ -241,7 +247,7 @@ __inline__ __device__ void extract_time( *second = 0; *millisecond = 0; } else { - *minute = to_non_negative_integer(hm_sep + 1, ms_sep - 1); + *minute = to_non_negative_integer(hm_sep + 1, ms_sep); // Find second-millisecond separator (if present) const auto sms_sep = thrust::find(thrust::seq, ms_sep + 1, end, '.'); @@ -249,7 +255,7 @@ __inline__ __device__ void extract_time( *second = to_non_negative_integer(ms_sep + 1, end); *millisecond = 0; } else { - *second = to_non_negative_integer(ms_sep + 1, sms_sep - 1); + *second = to_non_negative_integer(ms_sep + 1, sms_sep); *millisecond = to_non_negative_integer(sms_sep + 1, end); } } @@ -262,20 +268,17 @@ __inline__ __device__ void extract_time( * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. * * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string + * @param[in] end Pointer to the first element after the string * @param[in] dayfirst Flag to indicate that day is the first field - `DD/MM/YYYY` * @return Number of days since epoch */ __inline__ __device__ int32_t to_date(char const* begin, char const* end, bool dayfirst) { int day, month, year; - int32_t e = -1; - bool status = extract_date(begin, end, dayfirst, &year, &month, &day); - - if (status) e = days_since_epoch(year, month, day); - - return e; + return extract_date(begin, end, dayfirst, &year, &month, &day) + ? days_since_epoch(year, month, day) + : -1; } /** @@ -284,9 +287,9 @@ __inline__ __device__ int32_t to_date(char const* begin, char const* end, bool d * This function takes a string and produces a `date32` representation. * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. 
* - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the last element of the string - * @param[in] dayfirst Flag to indicate day/month or month/day order + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param dayfirst Flag to indicate day/month or month/day order * @return Milliseconds since epoch */ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, bool dayfirst) @@ -303,7 +306,7 @@ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, b // Attempt to locate the position between date and time, ignore premature space separators // around the day/month/year portions int count = 0; - for (auto i = begin; i <= end; ++i) { + for (auto i = begin; i < end; ++i) { if (count == 3 && *i == ' ') { sep_pos = i; break; @@ -315,7 +318,7 @@ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, b // There is only date if there's no separator, otherwise it's malformed if (sep_pos != end) { - if (extract_date(begin, sep_pos - 1, dayfirst, &year, &month, &day)) { + if (extract_date(begin, sep_pos, dayfirst, &year, &month, &day)) { extract_time(sep_pos + 1, end, &hour, &minute, &second, &millisecond); answer = seconds_since_epoch(year, month, day, hour, minute, second) * 1000 + millisecond; } @@ -334,7 +337,7 @@ __inline__ __device__ int64_t to_date_time(char const* begin, char const* end, b * Moves the `begin` iterator past the parsed value. * * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed and converted value */ template @@ -364,7 +367,7 @@ __inline__ __device__ T parse_integer(char const** begin, char const* end) * Moves the `begin` iterator past the parsed value. * * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed and converted value, zero if delimiter is not present */ template @@ -376,49 +379,16 @@ __inline__ __device__ T parse_optional_integer(char const** begin, char const* e return parse_integer(begin, end); } -/** - * @brief Excludes the prefix from the input range if the string starts with the prefix. - * - * @tparam N length on the prefix, plus one - * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param prefix String we're searching for at the start of the input range - * @return true if the input range starts with the given prefix - */ -template -__inline__ __device__ bool skip_if_starts_with(char const** begin, - char const* end, - const char (&prefix)[N]) -{ - static constexpr size_t prefix_len = N - 1; - if (end - *begin < prefix_len) return false; - auto const found = thrust::equal(thrust::seq, *begin, *begin + prefix_len, prefix); - if (found) (*begin) += prefix_len; - return found; -} - -/** - * @brief Modifies the input range to exclude the leading space characters.
- * - * @param begin[in, out] Pointer to the first element of the string - * @param end Pointer to the first element after the string - */ -__inline__ __device__ void skip_spaces(char const** begin, char const* end) -{ - *begin = thrust::find_if(thrust::seq, *begin, end, [](auto elem) { return elem != ' '; }); -} - /** * @brief Parses the input string into a duration of the given type. * * @param begin Pointer to the first element of the string - * @param end Pointer to the last element of the string + * @param end Pointer to the first element after the string * @return The parsed duration */ template __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) { - ++end; // %d days [+]%H:%M:%S.n => %d days, %d days [+]%H:%M:%S, %H:%M:%S.n, %H:%M:%S, %value. constexpr char sep = ':'; @@ -427,13 +397,14 @@ __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) // single pass to parse days, hour, minute, seconds, nanosecond auto cur = begin; auto const value = parse_integer(&cur, end); - skip_spaces(&cur, end + 1); + cur = skip_spaces(cur, end); if (std::is_same::value || cur >= end) { // %value return value; } // " days [+]" - bool const has_days_seperator = skip_if_starts_with(&cur, end + 1, "days"); - skip_spaces(&cur, end + 1); + auto const after_days_sep = skip_if_starts_with(cur, end, "days"); + auto const has_days_seperator = (after_days_sep != cur); + cur = skip_spaces(after_days_sep, end); cur += (*cur == '+'); if (has_days_seperator) { days = value; @@ -462,3 +433,6 @@ __inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) .count() + cuda::std::chrono::duration_cast(cudf::duration_ns{nanosecond}).count(); } + +} // namespace io +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index c761dadf198..1b7635f8d0d 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -32,7 +32,6 @@ #include #include #include -#include namespace cudf { namespace io { @@ -410,20 +409,6 @@ table_with_metadata read_parquet(parquet_reader_options const& options, return reader->read(options); } -// Freeform API wraps the detail writer class API -std::unique_ptr> write_parquet(parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - auto writer = make_writer(options.get_sink(), options, mr); - - return writer->write(options.get_table(), - options.get_metadata(), - options.is_enabled_return_filemetadata(), - options.get_column_chunks_file_path(), - options.get_decimal_precision()); -} - /** * @copydoc cudf::io::merge_rowgroup_metadata */ @@ -435,54 +420,52 @@ std::unique_ptr> merge_rowgroup_metadata( } /** - * @copydoc cudf::io::write_parquet_chunked_begin + * @copydoc cudf::io::write_parquet */ -std::shared_ptr write_parquet_chunked_begin( - chunked_parquet_writer_options const& op, rmm::mr::device_memory_resource* mr) +std::unique_ptr> write_parquet(parquet_writer_options const& options, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - parquet_writer_options options = parquet_writer_options::builder() - .compression(op.get_compression()) - .stats_level(op.get_stats_level()) - .int96_timestamps(op.is_enabled_int96_timestamps()); - - auto state = std::make_shared(); - state->wp = make_writer(op.get_sink(), options, mr); - - // have to make a copy of the metadata here since we can't really - // guarantee the lifetime of the incoming pointer - if (op.get_nullable_metadata() != nullptr) { - 
state->user_metadata_with_nullability = *op.get_nullable_metadata(); - state->user_metadata = &state->user_metadata_with_nullability; - } - state->int96_timestamps = op.is_enabled_int96_timestamps(); - state->_decimal_precision = op.get_decimal_precision(); - state->stream = 0; - state->wp->write_chunked_begin(*state); - return state; + namespace io_detail = cudf::io::detail; + + auto writer = make_writer( + options.get_sink(), options, io_detail::SingleWriteMode::YES, mr, rmm::cuda_stream_default); + + writer->write(options.get_table()); + return writer->close(options.get_column_chunks_file_path()); } /** - * @copydoc cudf::io::write_parquet_chunked + * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -void write_parquet_chunked(table_view const& table, std::shared_ptr state) +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op, + rmm::mr::device_memory_resource* mr) +{ + namespace io_detail = cudf::io::detail; + writer = make_writer( + op.get_sink(), op, io_detail::SingleWriteMode::NO, mr, rmm::cuda_stream_default); +} + +/** + * @copydoc cudf::io::parquet_chunked_writer::write + */ +parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table) { CUDF_FUNC_RANGE(); - state->wp->write_chunk(table, *state); + + writer->write(table); + + return *this; } /** - * @copydoc cudf::io::write_parquet_chunked_end + * @copydoc cudf::io::parquet_chunked_writer::close */ -std::unique_ptr> write_parquet_chunked_end( - std::shared_ptr& state, - bool return_filemetadata, - const std::string& column_chunks_file_path) +std::unique_ptr> parquet_chunked_writer::close( + std::string const& column_chunks_file_path) { CUDF_FUNC_RANGE(); - auto meta = state->wp->write_chunked_end(*state, return_filemetadata, column_chunks_file_path); - state.reset(); - return meta; + return writer->close(column_chunks_file_path); } } // namespace io diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 4deae310a53..7448d49e117 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -61,14 +61,14 @@ namespace { __device__ std::pair limit_range_to_brackets(char const *begin, char const *end) { - begin = thrust::find_if( - thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; }); - end = thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(++begin), - [](auto c) { return c == ']' || c == '}'; }) - .base(); - return {begin, --end}; + auto const data_begin = thrust::next(thrust::find_if( + thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; })); + auto const data_end = thrust::next(thrust::find_if(thrust::seq, + thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(data_begin), + [](auto c) { return c == ']' || c == '}'; })) + .base(); + return {data_begin, data_end}; } /** @@ -307,16 +307,12 @@ struct ConvertFunctor { { T &value{static_cast(output_column)[row]}; - // Check for user-specified true/false values first, where the output is - // replaced with 1/0 respectively value = [&opts, end, begin]() -> T { - if (serialized_trie_contains(opts.trie_true, begin, end - begin)) { - return 1; - } else if (serialized_trie_contains(opts.trie_false, begin, end - begin)) { - return 0; - } else { - return decode_value(begin, end - 1, opts); - } + // Check for user-specified true/false values + auto const len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, len})) { return 1; } + if 
(serialized_trie_contains(opts.trie_false, {begin, len})) { return 0; } + return decode_value(begin, end, opts); }(); return true; @@ -333,8 +329,9 @@ struct ConvertFunctor { size_t row, parse_options_view const &opts) { - auto &value{static_cast(out_buffer)[row]}; - value = decode_value(begin, end - 1, opts); + T const value = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = value; + + return !std::isnan(value); } @@ -351,46 +348,12 @@ struct ConvertFunctor { cudf::size_type row, const parse_options_view &opts) { - T &value{static_cast(output_column)[row]}; - value = decode_value(begin, end - 1, opts); + static_cast(output_column)[row] = decode_value(begin, end, opts); return true; } }; -/** - * @brief Checks whether the given character is a whitespace character. - * - * @param[in] ch The character to check - * - * @return True if the input is whitespace, False otherwise - */ -__inline__ __device__ bool is_whitespace(char ch) { return ch == '\t' || ch == ' '; } - -/** - * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. - * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes; '\0' if none - * - * @return Trimmed range - */ -__inline__ __device__ std::pair trim_whitespaces_quotes( - char const *begin, char const *end, char quotechar = '\0') -{ - auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; - - begin = thrust::find_if(thrust::seq, begin, end, not_whitespace); - end = thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(begin), - not_whitespace) - .base(); - - return {(*begin == quotechar) ? ++begin : begin, (*(end - 1) == quotechar) ? end - 1 : end}; -} - /** * @brief Returns true if the input character is a valid digit. * Supports both decimal and hexadecimal digits (uppercase and lowercase). @@ -550,7 +513,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, current = desc.value_end + 1; // Empty fields are not legal values - if (!serialized_trie_contains(opts.trie_na, desc.value_begin, value_len)) { + if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { auto str_list = static_cast(output_columns[desc.column]); @@ -622,7 +585,7 @@ __global__ void detect_data_types_kernel( current = desc.value_end + 1; // Checking if the field is empty/valid - if (serialized_trie_contains(opts.trie_na, desc.value_begin, value_len)) { + if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Increase the null count for array rows, where the null count is initialized to zero.
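The floating-point overload above now derives row validity from the decoded value itself instead of returning true unconditionally. A hedged host-side sketch of the pattern, with `strtod` standing in for `decode_value` (which, under the new half-open convention, receives `end` directly):

    #include <cmath>
    #include <cstddef>
    #include <cstdlib>

    // Store the parsed value unconditionally; a NaN result marks the row invalid.
    // Illustrative only: strtod ignores `end`, unlike the real decode_value.
    bool store_float(char const* begin, char const* /*end*/, double* out, std::size_t row)
    {
      double const value = std::strtod(begin, nullptr);
      out[row] = value;
      return !std::isnan(value);
    }

Returning `!std::isnan(value)` lets a failed parse surface as a null row without threading a separate status flag through the functor.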
if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } continue; @@ -678,8 +641,8 @@ __global__ void detect_data_types_kernel( } // Off by one if they are a hexadecimal number if (maybe_hex) { --int_req_number_cnt; } - if (serialized_trie_contains(opts.trie_true, desc.value_begin, value_len) || - serialized_trie_contains(opts.trie_false, desc.value_begin, value_len)) { + if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || + serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { atomicAdd(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp deleted file mode 100644 index d6758efe417..00000000000 --- a/cpp/src/io/parquet/chunked_state.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file chunked_state.hpp - * @brief definition for chunked state structure used by Parquet writer - */ - -#pragma once - -#include - -#include - -#include - -namespace cudf { -namespace io { - -enum class SingleWriteMode : bool { YES, NO }; - -/** - * @brief Chunked writer state struct. Contains various pieces of information - * needed that span the begin() / write() / end() call process. - */ -struct pq_chunked_state { - /// The writer to be used - std::unique_ptr wp; - /// Cuda stream to be used - rmm::cuda_stream_view stream; - /// Overall file metadata. Filled in during the process and written during write_chunked_end() - cudf::io::parquet::FileMetaData md; - /// current write position for rowgroups/chunks - std::size_t current_chunk_offset; - /// optional user metadata - table_metadata_with_nullability user_metadata_with_nullability; - /// special parameter only used by detail::write() to indicate that we are guaranteeing - /// a single table write. this enables some internal optimizations. - table_metadata const* user_metadata = nullptr; - /// only used in the write_chunked() case. copied from the (optionally) user supplied - /// argument to write_parquet_chunked_begin() - bool single_write_mode; - /// timestamps should be written as int96 types - bool int96_timestamps; - /// vector of precision values for decimal writing. Exactly one entry - /// per decimal column. 
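The `pq_chunked_state` struct being deleted here carried this information between `write_parquet_chunked_begin()`, `write_parquet_chunked()`, and `write_parquet_chunked_end()`; its fields move into `writer::impl`, and the public surface becomes the `parquet_chunked_writer` class shown in functions.cpp above. A hedged usage sketch of the new API (the sink path and table views are hypothetical):

    #include <cudf/io/parquet.hpp>

    void write_in_chunks(cudf::table_view const& part_a, cudf::table_view const& part_b)
    {
      auto opts = cudf::io::chunked_parquet_writer_options::builder(
                    cudf::io::sink_info{"out.parquet"})
                    .build();
      cudf::io::parquet_chunked_writer writer(opts);
      writer.write(part_a).write(part_b);  // write() returns *this, so calls chain
      writer.close();  // returns a metadata blob only when a column-chunks file path is given
    }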
- std::vector _decimal_precision; - - pq_chunked_state() = default; - - pq_chunked_state(table_metadata const* metadata, - SingleWriteMode mode = SingleWriteMode::NO, - bool write_int96_timestamps = false, - std::vector const& decimal_precision = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) - : stream{stream}, - user_metadata{metadata}, - single_write_mode{mode == SingleWriteMode::YES}, - int96_timestamps(write_int96_timestamps), - _decimal_precision(decimal_precision) - { - } -}; - -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9a4eab260b0..5f572e7544f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -503,8 +503,7 @@ void writer::impl::init_page_fragments(hostdevice_vector &fra uint32_t num_columns, uint32_t num_fragments, uint32_t num_rows, - uint32_t fragment_size, - rmm::cuda_stream_view stream) + uint32_t fragment_size) { CUDA_TRY(cudaMemcpyAsync(col_desc.device_ptr(), col_desc.host_ptr(), @@ -531,8 +530,7 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk hostdevice_vector &col_desc, uint32_t num_columns, uint32_t num_fragments, - uint32_t fragment_size, - rmm::cuda_stream_view stream) + uint32_t fragment_size) { rmm::device_vector frag_stats_group(num_fragments * num_columns); @@ -552,8 +550,7 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector &col_desc, uint32_t num_rowgroups, uint32_t num_columns, - uint32_t num_dictionaries, - rmm::cuda_stream_view stream) + uint32_t num_dictionaries) { size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize; rmm::device_vector dict_scratch(dict_scratch_size / sizeof(uint32_t)); @@ -591,8 +588,7 @@ void writer::impl::init_encoder_pages(hostdevice_vector &ch uint32_t num_rowgroups, uint32_t num_columns, uint32_t num_pages, - uint32_t num_stats_bfr, - rmm::cuda_stream_view stream) + uint32_t num_stats_bfr) { rmm::device_vector page_stats_mrg(num_stats_bfr); CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), @@ -631,8 +627,7 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, gpu_inflate_input_s *comp_in, gpu_inflate_status_s *comp_out, const statistics_chunk *page_stats, - const statistics_chunk *chunk_stats, - rmm::cuda_stream_view stream) + const statistics_chunk *chunk_stats) { gpu::EncodePages( pages, chunks.device_ptr(), pages_in_batch, first_page_in_batch, comp_in, comp_out, stream); @@ -672,42 +667,59 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, writer::impl::impl(std::unique_ptr sink, parquet_writer_options const &options, - rmm::mr::device_memory_resource *mr) + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) : _mr(mr), + stream(stream), compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), - out_sink_(std::move(sink)) + out_sink_(std::move(sink)), + decimal_precision(options.get_decimal_precision()), + single_write_mode(mode == SingleWriteMode::YES), + user_metadata(options.get_metadata()) { + init_state(); } -std::unique_ptr> writer::impl::write( - table_view const &table, - const table_metadata *metadata, - 
bool return_filemetadata, - const std::string &column_chunks_file_path, - std::vector const &decimal_precisions, - rmm::cuda_stream_view stream) +writer::impl::impl(std::unique_ptr sink, + chunked_parquet_writer_options const &options, + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) + : _mr(mr), + stream(stream), + compression_(to_parquet_compression(options.get_compression())), + stats_granularity_(options.get_stats_level()), + int96_timestamps(options.is_enabled_int96_timestamps()), + decimal_precision(options.get_decimal_precision()), + single_write_mode(mode == SingleWriteMode::YES), + out_sink_(std::move(sink)) { - pq_chunked_state state{ - metadata, SingleWriteMode::YES, int96_timestamps, decimal_precisions, stream}; + if (options.get_nullable_metadata() != nullptr) { + user_metadata_with_nullability = *options.get_nullable_metadata(); + user_metadata = &user_metadata_with_nullability; + } - write_chunked_begin(state); - write_chunk(table, state); - return write_chunked_end(state, return_filemetadata, column_chunks_file_path); + init_state(); } -void writer::impl::write_chunked_begin(pq_chunked_state &state) +writer::impl::~impl() { close(); } + +void writer::impl::init_state() { // Write file header file_header_s fhdr; fhdr.magic = parquet_magic; out_sink_->host_write(&fhdr, sizeof(fhdr)); - state.current_chunk_offset = sizeof(file_header_s); + current_chunk_offset = sizeof(file_header_s); } -void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) +void writer::impl::write(table_view const &table) { + CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); + size_type num_columns = table.num_columns(); size_type num_rows = 0; @@ -724,9 +736,9 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) // The user can pass in information about the nullability of a column to be enforced across // write_chunk() calls, in a flattened bool vector. Figure out that per column. auto per_column_nullability = - (state.single_write_mode) + (single_write_mode) ? std::vector>{} - : get_per_column_nullability(table, state.user_metadata_with_nullability.column_nullable); + : get_per_column_nullability(table, user_metadata_with_nullability.column_nullable); uint decimal_precision_idx = 0; @@ -740,19 +752,19 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) // one table tell us everything we need to know about their nullability. // Empty nullability means the writer figures out the nullability from the cudf columns. auto const &this_column_nullability = - (state.single_write_mode) ? std::vector{} : per_column_nullability[current_id]; + (single_write_mode) ? std::vector{} : per_column_nullability[current_id]; parquet_columns.emplace_back(current_id, col, this_column_nullability, - state.user_metadata, - state.int96_timestamps, - state._decimal_precision, + user_metadata, + int96_timestamps, + decimal_precision, decimal_precision_idx, - state.stream); + stream); } - CUDF_EXPECTS(decimal_precision_idx == state._decimal_precision.size(), + CUDF_EXPECTS(decimal_precision_idx == decimal_precision.size(), "Too many decimal precision values!"); // first call. setup metadata. num_rows will get incremented as write_chunk is @@ -825,8 +837,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) physical_type == parquet::Type::INT96 ? 
ConvertedType::UNKNOWN : col.converted_type(); col_schema.repetition_type = - (col.max_def_level() == 1 || - (state.single_write_mode && col.row_count() < (size_t)num_rows)) + (col.max_def_level() == 1 || (single_write_mode && col.row_count() < (size_t)num_rows)) ? OPTIONAL : REQUIRED; @@ -840,27 +851,27 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) } } - if (state.md.version == 0) { - state.md.version = 1; - state.md.num_rows = num_rows; - state.md.column_order_listsize = + if (md.version == 0) { + md.version = 1; + md.num_rows = num_rows; + md.column_order_listsize = (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_columns : 0; - if (state.user_metadata != nullptr) { - std::transform(state.user_metadata->user_data.begin(), - state.user_metadata->user_data.end(), - std::back_inserter(state.md.key_value_metadata), + if (user_metadata != nullptr) { + std::transform(user_metadata->user_data.begin(), + user_metadata->user_data.end(), + std::back_inserter(md.key_value_metadata), [](auto const &kv) { return KeyValue{kv.first, kv.second}; }); } - state.md.schema = this_table_schema; + md.schema = this_table_schema; } else { // verify the user isn't passing mismatched tables - CUDF_EXPECTS(state.md.schema == this_table_schema, + CUDF_EXPECTS(md.schema == this_table_schema, "Mismatch in schema between multiple calls to write_chunk"); // increment num rows - state.md.num_rows += num_rows; + md.num_rows += num_rows; } // Initialize column description @@ -920,11 +931,10 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) uint32_t num_fragments = (uint32_t)((num_rows + fragment_size - 1) / fragment_size); hostdevice_vector fragments(num_columns * num_fragments); if (fragments.size() != 0) { - init_page_fragments( - fragments, col_desc, num_columns, num_fragments, num_rows, fragment_size, state.stream); + init_page_fragments(fragments, col_desc, num_columns, num_fragments, num_rows, fragment_size); } - size_t global_rowgroup_base = state.md.row_groups.size(); + size_t global_rowgroup_base = md.row_groups.size(); // Decide row group boundaries based on uncompressed data size size_t rowgroup_size = 0; @@ -939,8 +949,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) if (f > rowgroup_start && (rowgroup_size + fragment_data_size > max_rowgroup_size_ || (f + 1 - rowgroup_start) * fragment_size > max_rowgroup_rows_)) { // update schema - state.md.row_groups.resize(state.md.row_groups.size() + 1); - state.md.row_groups[global_r++].num_rows = (f - rowgroup_start) * fragment_size; + md.row_groups.resize(md.row_groups.size() + 1); + md.row_groups[global_r++].num_rows = (f - rowgroup_start) * fragment_size; num_rowgroups++; rowgroup_start = f; rowgroup_size = 0; @@ -948,8 +958,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) rowgroup_size += fragment_data_size; if (f + 1 == num_fragments) { // update schema - state.md.row_groups.resize(state.md.row_groups.size() + 1); - state.md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * fragment_size; + md.row_groups.resize(md.row_groups.size() + 1); + md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * fragment_size; num_rowgroups++; } } @@ -959,13 +969,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) if (stats_granularity_ != statistics_freq::STATISTICS_NONE) { frag_stats.resize(num_fragments * num_columns); if (frag_stats.size() != 0) { - 
gather_fragment_statistics(frag_stats.data().get(), - fragments, - col_desc, - num_columns, - num_fragments, - fragment_size, - state.stream); + gather_fragment_statistics( + frag_stats.data().get(), fragments, col_desc, num_columns, num_fragments, fragment_size); } } // Initialize row groups and column chunks @@ -975,9 +980,9 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; r++, global_r++) { uint32_t fragments_in_chunk = - (uint32_t)((state.md.row_groups[global_r].num_rows + fragment_size - 1) / fragment_size); - state.md.row_groups[global_r].total_byte_size = 0; - state.md.row_groups[global_r].columns.resize(num_columns); + (uint32_t)((md.row_groups[global_r].num_rows + fragment_size - 1) / fragment_size); + md.row_groups[global_r].total_byte_size = 0; + md.row_groups[global_r].columns.resize(num_columns); for (int i = 0; i < num_columns; i++) { gpu::EncColumnChunk *ck = &chunks[r * num_columns + i]; bool dict_enable = false; @@ -991,7 +996,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) ck->stats = (frag_stats.size() != 0) ? frag_stats.data().get() + i * num_fragments + f : nullptr; ck->start_row = start_row; - ck->num_rows = (uint32_t)state.md.row_groups[global_r].num_rows; + ck->num_rows = (uint32_t)md.row_groups[global_r].num_rows; ck->first_fragment = i * num_fragments + f; ck->num_values = std::accumulate(fragments.host_ptr(i * num_fragments + f), @@ -1020,21 +1025,20 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) num_dictionaries++; } } - ck->has_dictionary = dict_enable; - state.md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); - state.md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, - Encoding::RLE}; + ck->has_dictionary = dict_enable; + md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); + md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, Encoding::RLE}; if (dict_enable) { - state.md.row_groups[global_r].columns[i].meta_data.encodings.push_back( + md.row_groups[global_r].columns[i].meta_data.encodings.push_back( Encoding::PLAIN_DICTIONARY); } - state.md.row_groups[global_r].columns[i].meta_data.path_in_schema = + md.row_groups[global_r].columns[i].meta_data.path_in_schema = parquet_columns[i].get_path_in_schema(); - state.md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; - state.md.row_groups[global_r].columns[i].meta_data.num_values = ck->num_values; + md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; + md.row_groups[global_r].columns[i].meta_data.num_values = ck->num_values; } f += fragments_in_chunk; - start_row += (uint32_t)state.md.row_groups[global_r].num_rows; + start_row += (uint32_t)md.row_groups[global_r].num_rows; } // Free unused dictionaries @@ -1042,8 +1046,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) // Build chunk dictionaries and count pages if (num_chunks != 0) { - build_chunk_dictionaries( - chunks, col_desc, num_rowgroups, num_columns, num_dictionaries, state.stream); + build_chunk_dictionaries(chunks, col_desc, num_rowgroups, num_columns, num_dictionaries); } // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) @@ -1092,8 +1095,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) (compression_ != 
parquet::Compression::UNCOMPRESSED) ? max_pages_in_batch : 0; uint32_t num_stats_bfr = (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_pages + num_chunks : 0; - rmm::device_buffer uncomp_bfr(max_uncomp_bfr_size, state.stream); - rmm::device_buffer comp_bfr(max_comp_bfr_size, state.stream); + rmm::device_buffer uncomp_bfr(max_uncomp_bfr_size, stream); + rmm::device_buffer comp_bfr(max_comp_bfr_size, stream); rmm::device_vector comp_in(max_comp_pages); rmm::device_vector comp_out(max_comp_pages); rmm::device_vector pages(num_pages); @@ -1121,8 +1124,7 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) num_rowgroups, num_columns, num_pages, - num_stats_bfr, - state.stream); + num_stats_bfr); } auto host_bfr = [&]() { @@ -1160,33 +1162,31 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) comp_out.data().get(), (stats_granularity_ == statistics_freq::STATISTICS_PAGE) ? page_stats.data().get() : nullptr, (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? page_stats.data().get() + num_pages - : nullptr, - state.stream); + : nullptr); for (; r < rnext; r++, global_r++) { for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk *ck = &chunks[r * num_columns + i]; uint8_t *dev_bfr; if (ck->is_compressed) { - state.md.row_groups[global_r].columns[i].meta_data.codec = compression_; - dev_bfr = ck->compressed_bfr; + md.row_groups[global_r].columns[i].meta_data.codec = compression_; + dev_bfr = ck->compressed_bfr; } else { dev_bfr = ck->uncompressed_bfr; } if (out_sink_->supports_device_write()) { // let the writer do what it wants to retrieve the data from the gpu. - out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, state.stream); + out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream); // we still need to do a (much smaller) memcpy for the statistics. if (ck->ck_stat_size != 0) { - state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize( - ck->ck_stat_size); - CUDA_TRY(cudaMemcpyAsync( - state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), - dev_bfr, - ck->ck_stat_size, - cudaMemcpyDeviceToHost, - state.stream.value())); - state.stream.synchronize(); + md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); + CUDA_TRY( + cudaMemcpyAsync(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), + dev_bfr, + ck->ck_stat_size, + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); } } else { // copy the full data @@ -1194,54 +1194,54 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) dev_bfr, ck->ck_stat_size + ck->compressed_size, cudaMemcpyDeviceToHost, - state.stream.value())); - state.stream.synchronize(); + stream.value())); + stream.synchronize(); out_sink_->host_write(host_bfr.get() + ck->ck_stat_size, ck->compressed_size); if (ck->ck_stat_size != 0) { - state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize( - ck->ck_stat_size); - memcpy(state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), + md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); + memcpy(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), host_bfr.get(), ck->ck_stat_size); } } - state.md.row_groups[global_r].total_byte_size += ck->compressed_size; - state.md.row_groups[global_r].columns[i].meta_data.data_page_offset = - state.current_chunk_offset + ((ck->has_dictionary) ? 
ck->dictionary_size : 0); - state.md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = - (ck->has_dictionary) ? state.current_chunk_offset : 0; - state.md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; - state.md.row_groups[global_r].columns[i].meta_data.total_compressed_size = - ck->compressed_size; - state.current_chunk_offset += ck->compressed_size; + md.row_groups[global_r].total_byte_size += ck->compressed_size; + md.row_groups[global_r].columns[i].meta_data.data_page_offset = + current_chunk_offset + ((ck->has_dictionary) ? ck->dictionary_size : 0); + md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = + (ck->has_dictionary) ? current_chunk_offset : 0; + md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; + md.row_groups[global_r].columns[i].meta_data.total_compressed_size = ck->compressed_size; + current_chunk_offset += ck->compressed_size; } } } } -std::unique_ptr> writer::impl::write_chunked_end( - pq_chunked_state &state, bool return_filemetadata, const std::string &column_chunks_file_path) +std::unique_ptr> writer::impl::close( + std::string const &column_chunks_file_path) { + if (closed) { return nullptr; } + closed = true; CompactProtocolWriter cpw(&buffer_); file_ender_s fendr; buffer_.resize(0); - fendr.footer_len = static_cast(cpw.write(state.md)); + fendr.footer_len = static_cast(cpw.write(md)); fendr.magic = parquet_magic; out_sink_->host_write(buffer_.data(), buffer_.size()); out_sink_->host_write(&fendr, sizeof(fendr)); out_sink_->flush(); // Optionally output raw file metadata with the specified column chunk file path - if (return_filemetadata) { + if (column_chunks_file_path.length() > 0) { file_header_s fhdr = {parquet_magic}; buffer_.resize(0); buffer_.insert(buffer_.end(), reinterpret_cast(&fhdr), reinterpret_cast(&fhdr) + sizeof(fhdr)); - for (auto &rowgroup : state.md.row_groups) { + for (auto &rowgroup : md.row_groups) { for (auto &col : rowgroup.columns) { col.file_path = column_chunks_file_path; } } - fendr.footer_len = static_cast(cpw.write(state.md)); + fendr.footer_len = static_cast(cpw.write(md)); buffer_.insert(buffer_.end(), reinterpret_cast(&fendr), reinterpret_cast(&fendr) + sizeof(fendr)); @@ -1254,43 +1254,32 @@ std::unique_ptr> writer::impl::write_chunked_end( // Forward to implementation writer::writer(std::unique_ptr sink, parquet_writer_options const &options, - rmm::mr::device_memory_resource *mr) - : _impl(std::make_unique(std::move(sink), options, mr)) + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) + : _impl(std::make_unique(std::move(sink), options, mode, mr, stream)) { } -// Destructor within this translation unit -writer::~writer() = default; - -// Forward to implementation -std::unique_ptr> writer::write(table_view const &table, - const table_metadata *metadata, - bool return_filemetadata, - const std::string column_chunks_file_path, - std::vector const &decimal_precisions, - rmm::cuda_stream_view stream) +writer::writer(std::unique_ptr sink, + chunked_parquet_writer_options const &options, + SingleWriteMode mode, + rmm::mr::device_memory_resource *mr, + rmm::cuda_stream_view stream) + : _impl(std::make_unique(std::move(sink), options, mode, mr, stream)) { - return _impl->write( - table, metadata, return_filemetadata, column_chunks_file_path, decimal_precisions, stream); } -// Forward to implementation -void writer::write_chunked_begin(pq_chunked_state &state) -{ - return 
_impl->write_chunked_begin(state); -} +// Destructor within this translation unit +writer::~writer() = default; // Forward to implementation -void writer::write_chunk(table_view const &table, pq_chunked_state &state) -{ - _impl->write_chunk(table, state); -} +void writer::write(table_view const &table) { _impl->write(table); } // Forward to implementation -std::unique_ptr> writer::write_chunked_end( - pq_chunked_state &state, bool return_filemetadata, const std::string &column_chunks_file_path) +std::unique_ptr> writer::close(std::string const &column_chunks_file_path) { - return _impl->write_chunked_end(state, return_filemetadata, column_chunks_file_path); + return _impl->close(column_chunks_file_path); } std::unique_ptr> writer::merge_rowgroup_metadata( diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index b664926f970..df76fb093fa 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,6 @@ #pragma once -#include "chunked_state.hpp" - #include #include @@ -68,57 +66,57 @@ class writer::impl { * * @param filepath Filepath if storing dataset to a file * @param options Settings for controlling behavior + * @param mode Option to write at once or in chunks * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches. */ explicit impl(std::unique_ptr sink, parquet_writer_options const& options, - rmm::mr::device_memory_resource* mr); + SingleWriteMode mode, + rmm::mr::device_memory_resource* mr, + rmm::cuda_stream_view stream); /** - * @brief Write an entire dataset to parquet format. + * @brief Constructor with chunked writer options. * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param return_filemetadata If true, return the raw parquet file metadata - * @param column_chunks_file_path Column chunks file path to be set in the raw output metadata - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return unique_ptr to FileMetadata thrift message if requested + * @param filepath Filepath if storing dataset to a file + * @param options Settings for controlling behavior + * @param mode Option to write at once or in chunks + * @param mr Device memory resource to use for device memory allocation + * @param stream CUDA stream used for device memory operations and kernel launches */ - std::unique_ptr> write(table_view const& table, - const table_metadata* metadata, - bool return_filemetadata, - const std::string& column_chunks_file_path, - std::vector const& decimal_precisions, - rmm::cuda_stream_view stream); + explicit impl(std::unique_ptr sink, + chunked_parquet_writer_options const& options, + SingleWriteMode mode, + rmm::mr::device_memory_resource* mr, + rmm::cuda_stream_view stream); /** - * @brief Begins the chunked/streamed write process. - * - * @param[in] pq_chunked_state Internal state maintained between chunks. + * @brief Destructor to complete any incomplete write and release resources. */ - void write_chunked_begin(pq_chunked_state& state); + ~impl(); /** - * @brief Writes a single subtable as part of a larger parquet file/table write. 
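Because `writer::impl::~impl()` now calls `close()` (see writer_impl.cu above) and `close()` returns early once the `closed` flag is set, the footer is flushed exactly once whether the caller closes explicitly or lets the writer go out of scope. A minimal sketch of that guard pattern, under the names this patch uses:

    // Illustrative only: an idempotent close() makes close-on-destruct safe.
    struct writer_like {
      bool closed = false;
      ~writer_like() { close(); }
      void close()
      {
        if (closed) { return; }  // already flushed by an explicit close()
        closed = true;
        // ... write file footer and flush the sink ...
      }
    };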
+ * @brief Initializes the states before writing. + */ + void init_state(); + + /** + * @brief Writes a single subtable as part of a larger parquet file/table write, + * normally used for chunked writing. * * @param[in] table The table information to be written - * @param[in] pq_chunked_state Internal state maintained between chunks. - * boundaries. */ - void write_chunk(table_view const& table, pq_chunked_state& state); + void write(table_view const& table); /** * @brief Finishes the chunked/streamed write process. * - * @param[in] pq_chunked_state Internal state maintained between chunks. - * @param return_filemetadata If true, return the raw parquet file metadata - * @param column_chunks_file_path Column chunks file path to be set in the raw output metadata - * @return unique_ptr to FileMetadata thrift message if requested + * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata + * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if + * `column_chunks_file_path` is provided, else null. */ - std::unique_ptr> write_chunked_end( - pq_chunked_state& state, - bool return_filemetadata = false, - const std::string& column_chunks_file_path = ""); + std::unique_ptr> close(std::string const& column_chunks_file_path = ""); private: /** @@ -130,15 +128,13 @@ class writer::impl { * @param num_fragments Total number of fragments per column * @param num_rows Total number of rows * @param fragment_size Number of rows per fragment - * @param stream CUDA stream used for device memory operations and kernel launches. */ void init_page_fragments(hostdevice_vector& frag, hostdevice_vector& col_desc, uint32_t num_columns, uint32_t num_fragments, uint32_t num_rows, - uint32_t fragment_size, - rmm::cuda_stream_view stream); + uint32_t fragment_size); /** * @brief Gather per-fragment statistics * @@ -148,15 +144,13 @@ class writer::impl { * @param num_columns Total number of columns * @param num_fragments Total number of fragments per column * @param fragment_size Number of rows per fragment - * @param stream CUDA stream used for device memory operations and kernel launches. */ void gather_fragment_statistics(statistics_chunk* dst_stats, hostdevice_vector& frag, hostdevice_vector& col_desc, uint32_t num_columns, uint32_t num_fragments, - uint32_t fragment_size, - rmm::cuda_stream_view stream); + uint32_t fragment_size); /** * @brief Build per-chunk dictionaries and count data pages * @@ -165,14 +159,12 @@ class writer::impl { * @param num_rowgroups Total number of rowgroups * @param num_columns Total number of columns * @param num_dictionaries Total number of dictionaries - * @param stream CUDA stream used for device memory operations and kernel launches. */ void build_chunk_dictionaries(hostdevice_vector& chunks, hostdevice_vector& col_desc, uint32_t num_rowgroups, uint32_t num_columns, - uint32_t num_dictionaries, - rmm::cuda_stream_view stream); + uint32_t num_dictionaries); /** * @brief Initialize encoder pages * @@ -183,7 +175,6 @@ class writer::impl { * @param num_columns Total number of columns * @param num_pages Total number of pages * @param num_stats_bfr Number of statistics buffers - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ void init_encoder_pages(hostdevice_vector& chunks, hostdevice_vector& col_desc, @@ -193,8 +184,7 @@ class writer::impl { uint32_t num_rowgroups, uint32_t num_columns, uint32_t num_pages, - uint32_t num_stats_bfr, - rmm::cuda_stream_view stream); + uint32_t num_stats_bfr); /** * @brief Encode a batch pages * @@ -209,7 +199,6 @@ class writer::impl { * @param comp_out compressor status array * @param page_stats optional page-level statistics (nullptr if none) * @param chunk_stats optional chunk-level statistics (nullptr if none) - * @param stream CUDA stream used for device memory operations and kernel launches. */ void encode_pages(hostdevice_vector& chunks, gpu::EncPage* pages, @@ -221,12 +210,13 @@ class writer::impl { gpu_inflate_input_s* comp_in, gpu_inflate_status_s* comp_out, const statistics_chunk* page_stats, - const statistics_chunk* chunk_stats, - rmm::cuda_stream_view stream); + const statistics_chunk* chunk_stats); private: // TODO : figure out if we want to keep this. It is currently unused. rmm::mr::device_memory_resource* _mr = nullptr; + // Cuda stream to be used + rmm::cuda_stream_view stream = rmm::cuda_stream_default; size_t max_rowgroup_size_ = DEFAULT_ROWGROUP_MAXSIZE; size_t max_rowgroup_rows_ = DEFAULT_ROWGROUP_MAXROWS; @@ -234,6 +224,23 @@ class writer::impl { Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; + // Overall file metadata. Filled in during the process and written during write_chunked_end() + cudf::io::parquet::FileMetaData md; + // optional user metadata + table_metadata_with_nullability user_metadata_with_nullability; + // only used in the write_chunked() case. copied from the (optionally) user supplied + // argument to write() + table_metadata const* user_metadata = nullptr; + // to track if the output has been written to sink + bool closed = false; + // vector of precision values for decimal writing. Exactly one entry + // per decimal column. + std::vector decimal_precision; + // current write position for rowgroups/chunks + std::size_t current_chunk_offset; + // special parameter only used by detail::write() to indicate that we are guaranteeing + // a single table write. this enables some internal optimizations. + bool const single_write_mode = true; std::vector buffer_; std::unique_ptr out_sink_; diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 49f5d285647..c7f405e1cc0 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -89,10 +89,10 @@ namespace gpu { * Also iterates over (one or more) delimiter characters after the field. * Function applies to formats with field delimiters and line terminators. * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param[in] opts A set of parsing options - * @param[in] escape_char A boolean value to signify whether to consider `\` as escape character or + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts A set of parsing options + * @param escape_char A boolean value to signify whether to consider `\` as escape character or * just a character. 
* * @return Pointer to the last character in the field, including the @@ -191,33 +191,33 @@ __inline__ __device__ char to_lower(char const c) } /** - * @brief Check if string is infinity, case insensitive with/without sign + * @brief Checks if string is infinity, case insensitive with/without sign * Valid infinity strings are inf, +inf, -inf, infinity, +infinity, -infinity * String comparison is case insensitive. * - * @param start The pointer to character array to start parsing from - * @param end The pointer to character array to end parsing + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string * @return true if string is valid infinity, else false. */ -__inline__ __device__ bool is_infinity(char const* start, char const* end) +__inline__ __device__ bool is_infinity(char const* begin, char const* end) { - if (*start == '-' || *start == '+') start++; + if (*begin == '-' || *begin == '+') begin++; char const* cinf = "infinity"; - auto index = start; - while (index <= end) { + auto index = begin; + while (index < end) { if (*cinf != to_lower(*index)) break; index++; cinf++; } - return ((index == start + 3 || index == start + 8) && index > end); + return ((index == begin + 3 || index == begin + 8) && index >= end); } /** * @brief Parses a character string and returns its numeric value. * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param[in] opts The global parsing behavior options + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts The global parsing behavior options * @tparam base Base (radix) to use for conversion * * @return The parsed and converted value @@ -240,11 +240,11 @@ __inline__ __device__ T parse_numeric(const char* begin, if (*begin == '-' || *begin == '+') begin++; // Skip over the "0x" prefix for hex notation - if (base == 16 && begin + 2 <= end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; } + if (base == 16 && begin + 2 < end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; } // Handle the whole part of the number // auto index = begin; - while (begin <= end) { + while (begin < end) { if (*begin == opts.decimal) { ++begin; break; @@ -259,7 +259,7 @@ __inline__ __device__ T parse_numeric(const char* begin, if (std::is_floating_point::value) { // Handle fractional part of the number if necessary double divisor = 1; - while (begin <= end) { + while (begin < end) { if (*begin == 'e' || *begin == 'E') { ++begin; break; @@ -271,11 +271,11 @@ __inline__ __device__ T parse_numeric(const char* begin, } // Handle exponential part of the number if necessary - if (begin <= end) { + if (begin < end) { const int32_t exponent_sign = *begin == '-' ? -1 : 1; if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; - while (begin <= end) { + while (begin < end) { exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); } if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } @@ -459,5 +459,75 @@ std::string infer_compression_type( const std::string& filename, const std::vector>& ext_to_comp_map); +/** + * @brief Checks whether the given character is a whitespace character. 
+ * + * @param[in] ch The character to check + * + * @return True if the input is whitespace, False otherwise + */ +__inline__ __device__ bool is_whitespace(char ch) { return ch == '\t' || ch == ' '; } + +/** + * @brief Skips past the current character if it matches the given value. + */ +template +__inline__ __device__ It skip_character(It const& it, char ch) +{ + return it + (*it == ch); +} + +/** + * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. + * + * @param[in] begin Pointer to the first character in the parsing range + * @param[in] end Pointer to the first character after the parsing range + * @param[in] quotechar The character used to denote quotes; '\0' if none + * + * @return Trimmed range + */ +__inline__ __device__ std::pair trim_whitespaces_quotes( + char const* begin, char const* end, char quotechar = '\0') +{ + auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; + + auto const trim_begin = thrust::find_if(thrust::seq, begin, end, not_whitespace); + auto const trim_end = thrust::find_if(thrust::seq, + thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(trim_begin), + not_whitespace); + + return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()}; +} + +/** + * @brief Excludes the prefix from the input range if the string starts with the prefix. + * + * @tparam N length of the prefix, plus one + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param prefix String we're searching for at the start of the input range + * @return Pointer to the first element past the prefix if the input starts with it, `begin` otherwise + */ +template +__inline__ __device__ auto skip_if_starts_with(char const* begin, + char const* end, + const char (&prefix)[N]) +{ + static constexpr size_t prefix_len = N - 1; + if (end - begin < prefix_len) return begin; + return thrust::equal(thrust::seq, begin, begin + prefix_len, prefix) ? begin + prefix_len : begin; +} + +/** + * @brief Finds the first element after the leading space characters. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + */ +__inline__ __device__ auto skip_spaces(char const* begin, char const* end) +{ + return thrust::find_if(thrust::seq, begin, end, [](auto elem) { return elem != ' '; }); +} + } // namespace io } // namespace cudf diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 10647dd934d..c634aa8d06b 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -74,6 +74,19 @@ boost::filesystem::path getCacheDir() // empty, to disallow use of file cache at runtime. if (not kernel_cache_path.empty()) { kernel_cache_path /= std::string{CUDF_STRINGIFY(CUDF_VERSION)}; + + // Make a per-device cache based on compute capability, so that multiple devices of + // different compute capabilities do not share the same kernel cache.
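The lines that follow derive that per-device suffix, giving each compute capability its own subdirectory under the versioned cache root. A self-contained sketch of the same computation (illustrative; the example values assume a V100 and an A100):

    #include <cuda_runtime.h>
    #include <string>

    // Returns e.g. "70" on a V100 (CC 7.0) or "80" on an A100 (CC 8.0).
    std::string compute_cc_suffix()
    {
      int device, cc_major, cc_minor;
      cudaGetDevice(&device);
      cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device);
      cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device);
      return std::to_string(cc_major * 10 + cc_minor);
    }

The cache path then becomes <cache root>/<CUDF_VERSION>/<cc>/, so JIT artifacts compiled for one architecture are never loaded on another.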
+ int device; + int cc_major; + int cc_minor; + CUDA_TRY(cudaGetDevice(&device)); + CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); + CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device)); + int cc = cc_major * 10 + cc_minor; + + kernel_cache_path /= std::to_string(cc); + try { // `mkdir -p` the kernel cache path if it doesn't exist boost::filesystem::create_directories(kernel_cache_path); diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu new file mode 100644 index 00000000000..49f06d5acfd --- /dev/null +++ b/cpp/src/lists/contains.cu @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace lists { + +namespace { + +auto get_search_keys_device_iterable_view(cudf::column_view const& search_keys, + rmm::cuda_stream_view stream) +{ + return column_device_view::create(search_keys, stream); +} + +auto get_search_keys_device_iterable_view(cudf::scalar const& search_key, rmm::cuda_stream_view) +{ + return &search_key; +} + +template +auto get_pair_iterator(cudf::column_device_view const& d_search_keys) +{ + return d_search_keys.pair_begin(); +} + +template +auto get_pair_iterator(cudf::scalar const& search_key) +{ + return cudf::detail::make_pair_iterator(search_key); +} + +/** + * @brief Functor to search each list row for the specified search keys. + */ +template +struct lookup_functor { + template + struct is_supported { + static constexpr bool value = cudf::is_numeric() || + cudf::is_chrono() || + std::is_same::value; + }; + + template + std::enable_if_t::value, std::unique_ptr> operator()( + Args&&...) 
const
+  {
+    CUDF_FAIL("lists::contains() is only supported on numeric types, chrono types, and strings.");
+  }
+
+  std::pair<rmm::device_buffer, size_type> construct_null_mask(
+    lists_column_view const& input_lists,
+    column_view const& result_validity,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr)
+  {
+    if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) {
+      return {rmm::device_buffer{0, stream, mr}, size_type{0}};
+    } else {
+      return cudf::detail::valid_if(result_validity.begin<bool>(),
+                                    result_validity.end<bool>(),
+                                    thrust::identity<bool>{},
+                                    stream,
+                                    mr);
+    }
+  }
+
+  template <typename T, typename SearchKeyPairIter>
+  void search_each_list_row(cudf::detail::lists_column_device_view const& d_lists,
+                            SearchKeyPairIter search_key_pair_iter,
+                            cudf::mutable_column_device_view mutable_ret_bools,
+                            cudf::mutable_column_device_view mutable_ret_validity,
+                            rmm::cuda_stream_view stream,
+                            rmm::mr::device_memory_resource* mr)
+  {
+    thrust::for_each(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator<size_type>(0),
+      thrust::make_counting_iterator<size_type>(d_lists.size()),
+      [d_lists,
+       search_key_pair_iter,
+       d_bools    = mutable_ret_bools.data<bool>(),
+       d_validity = mutable_ret_validity.data<bool>()] __device__(auto row_index) {
+        auto search_key_and_validity    = search_key_pair_iter[row_index];
+        auto const& search_key_is_valid = search_key_and_validity.second;
+
+        if (search_keys_have_nulls && !search_key_is_valid) {
+          d_bools[row_index]    = false;
+          d_validity[row_index] = false;
+          return;
+        }
+
+        auto list = cudf::list_device_view(d_lists, row_index);
+        if (list.is_null()) {
+          d_bools[row_index]    = false;
+          d_validity[row_index] = false;
+          return;
+        }
+
+        auto search_key    = search_key_and_validity.first;
+        d_bools[row_index] = thrust::find_if(thrust::seq,
+                                             list.pair_begin<T>(),
+                                             list.pair_end<T>(),
+                                             [search_key] __device__(auto element_and_validity) {
+                                               return element_and_validity.second &&
+                                                      (element_and_validity.first == search_key);
+                                             }) != list.pair_end<T>();
+        d_validity[row_index] =
+          d_bools[row_index] ||
+          thrust::none_of(thrust::seq,
+                          thrust::make_counting_iterator(size_type{0}),
+                          thrust::make_counting_iterator(list.size()),
+                          [&list] __device__(auto const& i) { return list.is_null(i); });
+      });
+  }
+
+  template <typename T, typename SearchKeyType>
+  std::enable_if_t<is_supported<T>::value, std::unique_ptr<column>> operator()(
+    cudf::lists_column_view const& lists,
+    SearchKeyType const& search_key,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr)
+  {
+    using namespace cudf;
+    using namespace cudf::detail;
+
+    CUDF_EXPECTS(!cudf::is_nested(lists.child().type()),
+                 "Nested types not supported in lists::contains()");
+    CUDF_EXPECTS(lists.child().type().id() == search_key.type().id(),
+                 "Type of search key does not match list column element type.");
+    CUDF_EXPECTS(search_key.type().id() != type_id::EMPTY, "Type cannot be empty.");
+
+    auto constexpr search_key_is_scalar = std::is_same<SearchKeyType, cudf::scalar>::value;
+
+    if (search_keys_have_nulls && search_key_is_scalar) {
+      return make_fixed_width_column(data_type(type_id::BOOL8),
+                                     lists.size(),
+                                     cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr),
+                                     lists.size(),
+                                     stream,
+                                     mr);
+    }
+
+    auto const device_view = column_device_view::create(lists.parent(), stream);
+    auto const d_lists     = lists_column_device_view(*device_view);
+    auto const d_skeys     = get_search_keys_device_iterable_view(search_key, stream);
+
+    auto const lists_column_has_nulls = lists.has_nulls() || lists.child().has_nulls();
+
+    auto result_validity = make_fixed_width_column(
+      data_type{type_id::BOOL8}, lists.size(), cudf::mask_state::UNALLOCATED, stream, mr);
+    auto result_bools = make_fixed_width_column(
+      data_type{type_id::BOOL8}, lists.size(), cudf::mask_state::UNALLOCATED, stream, mr);
+    auto mutable_result_bools =
+      mutable_column_device_view::create(result_bools->mutable_view(), stream);
+    auto mutable_result_validity =
+      mutable_column_device_view::create(result_validity->mutable_view(), stream);
+    auto search_key_iter = get_pair_iterator<T>(*d_skeys);
+
+    search_each_list_row<T>(
+      d_lists, search_key_iter, *mutable_result_bools, *mutable_result_validity, stream, mr);
+
+    rmm::device_buffer null_mask;
+    size_type num_nulls;
+
+    std::tie(null_mask, num_nulls) =
+      construct_null_mask(lists, result_validity->view(), stream, mr);
+    result_bools->set_null_mask(std::move(null_mask), num_nulls);
+
+    return result_bools;
+  }
+};
+
+}  // namespace
+
+namespace detail {
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::scalar const& search_key,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  return search_key.is_valid(stream)
+           ? cudf::type_dispatcher(
+               search_key.type(), lookup_functor<false>{}, lists, search_key, stream, mr)
+           : cudf::type_dispatcher(
+               search_key.type(), lookup_functor<true>{}, lists, search_key, stream, mr);
+}
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::column_view const& search_keys,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(search_keys.size() == lists.size(),
+               "Number of search keys must match list column size.");
+
+  return search_keys.has_nulls()
+           ? cudf::type_dispatcher(
+               search_keys.type(), lookup_functor<true>{}, lists, search_keys, stream, mr)
+           : cudf::type_dispatcher(
+               search_keys.type(), lookup_functor<false>{}, lists, search_keys, stream, mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::scalar const& search_key,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::contains(lists, search_key, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
+                                 cudf::column_view const& search_keys,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::contains(lists, search_keys, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace lists
+}  // namespace cudf
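A minimal usage sketch of the new API added above (illustrative only; the column contents and helper name are hypothetical, the `cudf::lists::contains` signature is the one introduced in this diff):

#include <cudf/lists/contains.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>

// Ask, per list row, whether the row contains the key 1. NULL list rows stay
// NULL; rows that lack the key but contain NULL elements also become NULL,
// matching construct_null_mask() above.
std::unique_ptr<cudf::column> rows_containing_one(cudf::lists_column_view const& lists)
{
  cudf::numeric_scalar<int32_t> key{1};
  return cudf::lists::contains(lists, key);  // BOOL8 column, one row per list
}
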
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
new file mode 100644
index 00000000000..78549152770
--- /dev/null
+++ b/cpp/src/lists/count_elements.cu
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace lists {
+namespace detail {
+/**
+ * @brief Returns a numeric column containing the length of each list element.
+ *
+ * @param input Input lists column.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New INT32 column with lengths.
+ */
+std::unique_ptr<column> count_elements(lists_column_view const& input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  auto device_column = cudf::column_device_view::create(input.parent(), stream);
+  auto d_column      = *device_column;
+  // create output column
+  auto output = make_fixed_width_column(data_type{type_to_id<size_type>()},
+                                        input.size(),
+                                        copy_bitmask(input.parent()),
+                                        input.null_count(),
+                                        stream,
+                                        mr);
+
+  // fill in the sizes
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator<size_type>(0),
+                    thrust::make_counting_iterator<size_type>(input.size()),
+                    output->mutable_view().begin<size_type>(),
+                    [d_column] __device__(size_type idx) {
+                      if (d_column.is_null(idx)) return size_type{0};
+                      auto d_offsets =
+                        d_column.child(lists_column_view::offsets_column_index).data<size_type>() +
+                        d_column.offset();
+                      return d_offsets[idx + 1] - d_offsets[idx];
+                    });
+
+  output->set_null_count(input.null_count());  // reset null count
+  return output;
+}
+
+}  // namespace detail
+
+// external APIs
+
+std::unique_ptr<column> count_elements(lists_column_view const& input,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::count_elements(input, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace lists
+}  // namespace cudf
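A quick sketch of the behavior (example values are hypothetical): NULL rows stay NULL in the output — the `size_type{0}` written for them is only a placeholder under the copied null mask.

#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>

// For a lists column [[1,2,3], NULL, [], [4,5]] the result is [3, NULL, 0, 2]:
// each length is d_offsets[i+1] - d_offsets[i], and the parent's bitmask is
// copied onto the output.
std::unique_ptr<cudf::column> list_sizes(cudf::lists_column_view const& lists)
{
  return cudf::lists::count_elements(lists);
}
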
diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu
new file mode 100644
index 00000000000..bc532893fb0
--- /dev/null
+++ b/cpp/src/reshape/explode.cu
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace detail {
+namespace {
+/**
+ * @brief Function object for exploding a column.
+ */
+struct explode_functor {
+  template <typename T>
+  std::unique_ptr<table> operator()(table_view const& input_table,
+                                    size_type explode_column_idx,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr) const
+  {
+    CUDF_FAIL("Unsupported non-list column");
+
+    return std::make_unique<table>();
+  }
+};
+
+template <>
+std::unique_ptr<table> explode_functor::operator()<list_view>(
+  table_view const& input_table,
+  size_type explode_column_idx,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr) const
+{
+  /* we explode by building a gather map that includes the number of entries in each list inside
+     the column for each index. Interestingly, this can be done with lower_bound across the offsets
+     as values between the offsets will all map down to the index below. We have some off-by-one
+     manipulations we need to do with the output, but it's almost our gather map by itself. Once we
+     build the gather map we need to remove the explode column from the table and run gather on it.
+     Next we build the explode column, which turns out to be simply lifting the child column out of
+     the explode column. This unrolls the top level of lists. Then we need to insert the explode
+     column back into the table and return it. */
+  lists_column_view lc{input_table.column(explode_column_idx)};
+  auto sliced_child = lc.get_sliced_child(stream);
+  rmm::device_uvector<size_type> gather_map_indices(sliced_child.size(), stream, mr);
+
+  // sliced columns can make this a little tricky. We have to start iterating at the start of the
+  // offsets for this column, which could be > 0. Then we also have to handle rebasing the offsets
+  // as we go.
+  auto offsets           = lc.offsets().begin<size_type>() + lc.offset();
+  auto offsets_minus_one = thrust::make_transform_iterator(
+    offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; });
+  auto counting_iter = thrust::make_counting_iterator<size_type>(0);
+
+  // This looks like an off-by-one bug, but what is going on here is that we need to reduce each
+  // result from `lower_bound` by 1 to build the correct gather map. It was pointed out that
+  // this can be accomplished by simply skipping the first entry and using the result of
+  // `lower_bound` directly.
+  thrust::lower_bound(rmm::exec_policy(stream),
+                      offsets_minus_one + 1,
+                      offsets_minus_one + lc.size() + 1,
+                      counting_iter,
+                      counting_iter + gather_map_indices.size(),
+                      gather_map_indices.begin());
+
+  auto select_iter = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; });
+  std::vector<size_type> selected_columns(select_iter,
+                                          select_iter + input_table.num_columns() - 1);
+
+  auto gathered_table = cudf::detail::gather(
+    input_table.select(selected_columns),
+    column_view(data_type(type_to_id<size_type>()), sliced_child.size(), gather_map_indices.data()),
+    cudf::out_of_bounds_policy::DONT_CHECK,
+    cudf::detail::negative_index_policy::ALLOWED,
+    stream,
+    mr);
+
+  std::vector<std::unique_ptr<column>> columns = gathered_table.release()->release();
+
+  columns.insert(columns.begin() + explode_column_idx,
+                 std::make_unique<column>(column(sliced_child, stream, mr)));
+
+  return std::make_unique<table>(std::move(columns));
+}
+}  // namespace
+
+/**
+ * @copydoc
+ * cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<table>
explode(table_view const& input_table,
+                               size_type explode_column_idx,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr)
+{
+  return type_dispatcher(input_table.column(explode_column_idx).type(),
+                         explode_functor{},
+                         input_table,
+                         explode_column_idx,
+                         stream,
+                         mr);
+}
+
+}  // namespace detail
+
+/**
+ * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource)
+ */
+std::unique_ptr<table> explode(table_view const& input_table,
+                               size_type explode_column_idx,
+                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace cudf
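A worked example of the gather-map trick described in the comment above (values are illustrative): for offsets [0, 2, 3], `offsets_minus_one` is [-1, 1, 2]; running `lower_bound` for the values 0, 1, 2 over its tail [1, 2] yields the gather map [0, 0, 1], so row 0 (two list entries) repeats twice. A usage sketch, assuming the public declaration ships in the reshape header alongside the other reshape APIs:

#include <cudf/reshape.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Explode the list column at index 1:
//   { {"a", [1, 2]}, {"b", [3]} }  -->  { {"a", 1}, {"a", 2}, {"b", 3} }
std::unique_ptr<cudf::table> unroll(cudf::table_view const& input)
{
  return cudf::explode(input, 1);
}
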
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 052c2aaedc7..fe051b1ffc5 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 */
 
 #include
+#include
 
 #include
 #include
@@ -22,6 +23,29 @@
 #include
 
 namespace cudf {
+
+string_scalar::string_scalar(rmm::device_scalar<value_type>& data,
+                             bool is_valid,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
+  : string_scalar(data.value(stream), is_valid, stream, mr)
+{
+}
+
+string_scalar::string_scalar(value_type const& source,
+                             bool is_valid,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
+  : scalar(data_type(type_id::STRING), is_valid),
+    _data(source.data(), source.size_bytes(), stream, mr)
+{
+}
+
+string_scalar::value_type string_scalar::value(rmm::cuda_stream_view stream) const
+{
+  return value_type{data(), size()};
+}
+
 std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
   std::string result;
diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
index b6c603c231f..2d36a573a49 100644
--- a/cpp/src/sort/sort.cu
+++ b/cpp/src/sort/sort.cu
@@ -55,6 +55,53 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
                               mr);
 }
 
+struct inplace_column_sort_fn {
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void operator()(mutable_column_view& col, bool ascending, rmm::cuda_stream_view stream) const
+  {
+    CUDF_EXPECTS(!col.has_nulls(), "Nulls not supported for in-place sort");
+    using DeviceT = device_storage_type_t<T>;
+    if (ascending) {
+      thrust::sort(rmm::exec_policy(stream),
+                   col.begin<DeviceT>(),
+                   col.end<DeviceT>(),
+                   thrust::less<DeviceT>());
+    } else {
+      thrust::sort(rmm::exec_policy(stream),
+                   col.begin<DeviceT>(),
+                   col.end<DeviceT>(),
+                   thrust::greater<DeviceT>());
+    }
+  }
+
+  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
+  void operator()(mutable_column_view&, bool, rmm::cuda_stream_view) const
+  {
+    CUDF_FAIL("Column type must be relationally comparable and fixed-width");
+  }
+};
+
+std::unique_ptr<table> sort(table_view input,
+                            std::vector<order> const& column_order,
+                            std::vector<null_order> const& null_precedence,
+                            rmm::cuda_stream_view stream,
+                            rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  if (input.num_columns() == 1 && !input.column(0).has_nulls() &&
+      cudf::is_fixed_width(input.column(0).type())) {
+    auto output    = std::make_unique<column>(input.column(0), stream, mr);
+    auto view      = output->mutable_view();
+    bool ascending = (column_order.empty() ? true : column_order.front() == order::ASCENDING);
+    cudf::type_dispatcher(output->type(), inplace_column_sort_fn{}, view, ascending, stream);
+    std::vector<std::unique_ptr<column>> columns;
+    columns.emplace_back(std::move(output));
+    return std::make_unique<table>(std::move(columns));
+  }
+  return detail::sort_by_key(input, input, column_order, null_precedence, stream, mr);
+}
+
 }  // namespace detail
 
 std::unique_ptr<column> sorted_order(table_view input,
@@ -72,8 +119,7 @@ std::unique_ptr<table> sort(table_view input,
                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort_by_key(
-    input, input, column_order, null_precedence, rmm::cuda_stream_default, mr);
+  return detail::sort(input, column_order, null_precedence, rmm::cuda_stream_default, mr);
 }
 
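Note that the new fast path only fires for a single fixed-width, non-nullable column; everything else still routes through `detail::sort_by_key`. A sketch of a call that would take it (the column is hypothetical, and the defaulted order/precedence arguments are assumed):

#include <cudf/sorting.hpp>
#include <cudf/table/table_view.hpp>

// One INT32 column with no nulls: cudf::sort() copies it and sorts the copy
// in place with thrust::sort rather than materializing a gather map.
std::unique_ptr<cudf::table> sort_one(cudf::column_view const& int32_col)
{
  return cudf::sort(cudf::table_view{{int32_col}});
}
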
 std::unique_ptr<table> sort_by_key(table_view const& values,
diff --git a/cpp/src/sort/sort_column.cu b/cpp/src/sort/sort_column.cu
new file mode 100644
index 00000000000..070aa6eae03
--- /dev/null
+++ b/cpp/src/sort/sort_column.cu
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+namespace cudf {
+namespace detail {
+namespace {
+
+/**
+ * @brief Type-dispatched functor for sorting a single column.
+ */
+struct column_sorted_order_fn {
+  /**
+   * @brief Compile time check for allowing radix sort for column type.
+   *
+   * Floating point is removed here for special handling of NaNs.
+   */
+  template <typename T>
+  static constexpr bool is_radix_sort_supported()
+  {
+    return cudf::is_fixed_width<T>() && !cudf::is_floating_point<T>();
+  }
+
+  /**
+   * @brief Sorts fixed-width columns using faster thrust sort.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param ascending True if sort order is ascending
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void radix_sort(column_view const& input,
+                  mutable_column_view& indices,
+                  bool ascending,
+                  rmm::cuda_stream_view stream)
+  {
+    // A non-stable sort on a column of arithmetic type with no nulls will use a radix sort
+    // if specifying only the `thrust::less` or `thrust::greater` comparators.
+    // But this also requires making a copy of the input data.
+    auto temp_col = column(input, stream);
+    auto d_col    = temp_col.mutable_view();
+    using DeviceT = device_storage_type_t<T>;
+    if (ascending) {
+      thrust::sort_by_key(rmm::exec_policy(stream),
+                          d_col.begin<DeviceT>(),
+                          d_col.end<DeviceT>(),
+                          indices.begin<size_type>(),
+                          thrust::less<DeviceT>());
+    } else {
+      thrust::sort_by_key(rmm::exec_policy(stream),
+                          d_col.begin<DeviceT>(),
+                          d_col.end<DeviceT>(),
+                          indices.begin<size_type>(),
+                          thrust::greater<DeviceT>());
+    }
+  }
+  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
+  void radix_sort(column_view const&, mutable_column_view&, bool, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Only fixed-width types are suitable for faster sorting");
+  }
+
+  /**
+   * @brief Sorts a single column with a relationally comparable type.
+   *
+   * This includes numeric, timestamp, duration, and string types.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param ascending True if sort order is ascending
+   * @param null_precedence How null rows are to be ordered
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const& input,
+                  mutable_column_view& indices,
+                  bool ascending,
+                  null_order null_precedence,
+                  rmm::cuda_stream_view stream)
+  {
+    // column with nulls or non-supported types will also use a comparator
+    if (input.has_nulls() || !is_radix_sort_supported<T>()) {
+      auto keys = column_device_view::create(input, stream);
+      thrust::sort(rmm::exec_policy(stream),
+                   indices.begin<size_type>(),
+                   indices.end<size_type>(),
+                   simple_comparator<T>{*keys, input.has_nulls(), ascending, null_precedence});
+    } else {
+      radix_sort<T>(input, indices, ascending, stream);
+    }
+  }
+
+  template <typename T, std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const&, mutable_column_view&, bool, null_order, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Column type must be relationally comparable");
+  }
+};
+
+}  // namespace
+
+/**
+ * @copydoc
+ * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
+ */
+template <>
+std::unique_ptr<column> sorted_order<false>(column_view const& input,
+                                            order column_order,
+                                            null_order null_precedence,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::mr::device_memory_resource* mr)
+{
+  auto sorted_indices = cudf::make_numeric_column(
+    data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
+  mutable_column_view indices_view = sorted_indices->mutable_view();
+  thrust::sequence(
+    rmm::exec_policy(stream), indices_view.begin<size_type>(), indices_view.end<size_type>(), 0);
+  cudf::type_dispatcher(input.type(),
+                        column_sorted_order_fn{},
+                        input,
+                        indices_view,
+                        column_order == order::ASCENDING,
+                        null_precedence,
+                        stream);
+  return sorted_indices;
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh
index cfa3a726138..4fc83d343d5 100644
--- a/cpp/src/sort/sort_impl.cuh
+++ b/cpp/src/sort/sort_impl.cuh
@@ -18,20 +18,70 @@
 #include
 #include
-#include
 #include
 #include
 #include
 #include
-#include
+#include
 #include
 #include
+#include
 
 namespace cudf {
 namespace detail {
-// Create permuted row indices that would materialize sorted order
+
+/**
+ * @brief Comparator functor needed for single column sort.
+ *
+ * @tparam T Column element type.
+ */
+template <typename T>
+struct simple_comparator {
+  __device__ bool operator()(size_type lhs, size_type rhs)
+  {
+    if (has_nulls) {
+      bool lhs_null{d_column.is_null(lhs)};
+      bool rhs_null{d_column.is_null(rhs)};
+      if (lhs_null || rhs_null) {
+        if (!ascending) thrust::swap(lhs_null, rhs_null);
+        return (null_precedence == cudf::null_order::BEFORE ? !rhs_null : !lhs_null);
+      }
+    }
+    return relational_compare(d_column.element<T>(lhs), d_column.element<T>(rhs)) ==
+           (ascending ? weak_ordering::LESS : weak_ordering::GREATER);
+  }
+  column_device_view const d_column;
+  bool has_nulls;
+  bool ascending;
+  null_order null_precedence{};
+};
+
+/**
+ * @brief Sort indices of a single column.
+ *
+ * @param input Column to sort. The column data is not modified.
+ * @param column_order Ascending or descending sort order
+ * @param null_precedence How null rows are to be ordered
+ * @param stable True if sort should be stable
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Sorted indices for the input column.
+ */
+template <bool stable = false>
+std::unique_ptr<column> sorted_order(column_view const& input,
+                                     order column_order,
+                                     null_order null_precedence,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr);
+
+/**
+ * @copydoc
+ * sorted_order(table_view&,std::vector<order>,std::vector<null_order>,rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
 template <bool stable = false>
 std::unique_ptr<column> sorted_order(table_view input,
                                      std::vector<order> const& column_order,
@@ -53,28 +103,24 @@ std::unique_ptr<column> sorted_order(table_view input,
                "Mismatch between number of columns and null_precedence size.");
   }
 
-  // fast-path for single strings column sort
-  if (input.num_columns() == 1 && input.column(0).type().id() == type_id::STRING) {
-    return cudf::strings::detail::sorted_order<stable>(
-      strings_column_view(input.column(0)),
-      column_order.empty() ? order::ASCENDING : column_order.front(),
-      null_precedence.empty() ? null_order::BEFORE : null_precedence.front(),
-      stream,
-      mr);
-  }
-
+  // fast-path for single column sort
+  if (input.num_columns() == 1) {
+    auto const single_col = input.column(0);
+    auto const col_order  = column_order.empty() ? order::ASCENDING : column_order.front();
+    auto const null_prec  = null_precedence.empty() ? null_order::BEFORE : null_precedence.front();
+    return stable ? sorted_order<true>(single_col, col_order, null_prec, stream, mr)
+                  : sorted_order<false>(single_col, col_order, null_prec, stream, mr);
+  }
+
   std::unique_ptr<column> sorted_indices = cudf::make_numeric_column(
     data_type(type_to_id<size_type>()), input.num_rows(), mask_state::UNALLOCATED, stream, mr);
   mutable_column_view mutable_indices_view = sorted_indices->mutable_view();
-
-  auto device_table = table_device_view::create(input, stream);
   thrust::sequence(rmm::exec_policy(stream),
                    mutable_indices_view.begin<size_type>(),
                    mutable_indices_view.end<size_type>(),
                    0);
 
+  auto device_table = table_device_view::create(input, stream);
   rmm::device_vector<order> d_column_order(column_order);
 
   if (has_nulls(input)) {
diff --git a/cpp/src/sort/stable_sort_column.cu b/cpp/src/sort/stable_sort_column.cu
new file mode 100644
index 00000000000..abeaa7bef76
--- /dev/null
+++ b/cpp/src/sort/stable_sort_column.cu
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+namespace cudf {
+namespace detail {
+namespace {
+
+struct column_stable_sorted_order_fn {
+  /**
+   * @brief Stable sort of fixed-width columns using a thrust sort with no comparator.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void faster_stable_sort(column_view const& input,
+                          mutable_column_view& indices,
+                          rmm::cuda_stream_view stream)
+  {
+    auto temp_col = column(input, stream);
+    auto d_col    = temp_col.mutable_view();
+    using DeviceT = device_storage_type_t<T>;
+    thrust::stable_sort_by_key(rmm::exec_policy(stream),
+                               d_col.begin<DeviceT>(),
+                               d_col.end<DeviceT>(),
+                               indices.begin<size_type>());
+  }
+  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>()>* = nullptr>
+  void faster_stable_sort(column_view const&, mutable_column_view&, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Only fixed-width types are suitable for faster stable sorting");
+  }
+
+  /**
+   * @brief Stable sorts a single column with a relationally comparable type.
+   *
+   * This includes numeric, timestamp, duration, and string types.
+   *
+   * @param input Column to sort
+   * @param indices Output sorted indices
+   * @param ascending True if sort order is ascending
+   * @param null_precedence How null rows are to be ordered
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
+  template <typename T, std::enable_if_t<cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const& input,
+                  mutable_column_view& indices,
+                  bool ascending,
+                  null_order null_precedence,
+                  rmm::cuda_stream_view stream)
+  {
+    if (!ascending || input.has_nulls() || !cudf::is_fixed_width<T>()) {
+      auto keys = column_device_view::create(input, stream);
+      thrust::stable_sort(
+        rmm::exec_policy(stream),
+        indices.begin<size_type>(),
+        indices.end<size_type>(),
+        simple_comparator<T>{*keys, input.has_nulls(), ascending, null_precedence});
+    } else {
+      faster_stable_sort<T>(input, indices, stream);
+    }
+  }
+  template <typename T, std::enable_if_t<!cudf::is_relationally_comparable<T, T>()>* = nullptr>
+  void operator()(column_view const&, mutable_column_view&, bool, null_order, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Column type must be relationally comparable");
+  }
+};
+
+}  // namespace
+
+/**
+ * @copydoc
+ * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
+ */
+template <>
+std::unique_ptr<column> sorted_order<true>(column_view const& input,
+                                           order column_order,
+                                           null_order null_precedence,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
+{
+  auto sorted_indices = cudf::make_numeric_column(
+    data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
+  mutable_column_view indices_view = sorted_indices->mutable_view();
+  thrust::sequence(
+    rmm::exec_policy(stream), indices_view.begin<size_type>(), indices_view.end<size_type>(), 0);
+  cudf::type_dispatcher(input.type(),
+                        column_stable_sorted_order_fn{},
+                        input,
+                        indices_view,
+                        column_order == order::ASCENDING,
+                        null_precedence,
+                        stream);
+  return sorted_indices;
+}
+
+}  // namespace detail
+}  // namespace cudf
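Stability here is a compile-time choice: the `template <bool stable>` specializations above are separate instantiations that share `simple_comparator`. A sketch of the dispatch, mirroring the single-column fast path in sort_impl.cuh (internal detail API, shown only to illustrate the template parameter):

// Unstable (<false>) may reorder equal keys and can take the radix path for
// fixed-width, non-null input; stable (<true>) keeps equal keys in order.
std::unique_ptr<cudf::column> indices(cudf::column_view const& col,
                                      bool stable,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
{
  return stable ? cudf::detail::sorted_order<true>(
                    col, cudf::order::ASCENDING, cudf::null_order::BEFORE, stream, mr)
                : cudf::detail::sorted_order<false>(
                    col, cudf::order::ASCENDING, cudf::null_order::BEFORE, stream, mr);
}
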
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu
index 0f344fb7111..b76e1932196 100644
--- a/cpp/src/strings/regex/regexec.cu
+++ b/cpp/src/strings/regex/regexec.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@
 #include
 #include
 
+#include
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -40,7 +42,9 @@ namespace {
 std::vector<char32_t> string_to_char32_vector(std::string const& pattern)
 {
   size_type size  = static_cast<size_type>(pattern.size());
-  size_type count = characters_in_string(pattern.c_str(), size);
+  size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) {
+    return is_begin_utf8_char(static_cast<uint8_t>(ch));
+  });
   std::vector<char32_t> result(count + 1);
   char32_t* output_ptr  = result.data();
   const char* input_ptr = pattern.data();
diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp
index d5537957013..dba31ecc21e 100644
--- a/cpp/src/structs/structs_column_view.cpp
+++ b/cpp/src/structs/structs_column_view.cpp
@@ -36,7 +36,9 @@ column_view structs_column_view::get_sliced_child(int index) const
           size(),
           child(index).head(),
           child(index).null_mask(),
-          child(index).null_count(),
+          // TODO: could potentially compute the actual count here, but at
+          // the moment this interface doesn't take a stream.
+          UNKNOWN_NULL_COUNT,
           offset(),
           children};
 }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ad05c871012..8395a3cc1f2 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -39,7 +39,6 @@ endif("$ENV{CONDA_BUILD}" STREQUAL "1")
 add_library(cudftestutil STATIC
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/base_fixture.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/column_utilities.cu"
-            "${CMAKE_CURRENT_SOURCE_DIR}/utilities/scalar_utilities.cu"
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/table_utilities.cu"
             "${CMAKE_CURRENT_SOURCE_DIR}/strings/utilities.cu")
 
@@ -524,6 +523,7 @@ ConfigureTest(SEARCH_TEST "${SEARCH_TEST_SRC}")
 
 set(RESHAPE_TEST_SRC
     "${CMAKE_CURRENT_SOURCE_DIR}/reshape/byte_cast_tests.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/reshape/explode_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/reshape/interleave_columns_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/reshape/tile_tests.cpp")
 
@@ -661,6 +661,8 @@ ConfigureTest(AST_TEST "${AST_TEST_SRC}")
 # - lists tests ----------------------------------------------------------------------------------
 
 set(LISTS_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/lists/contains_tests.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/lists/count_elements_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/lists/extract_tests.cpp")
 
 ConfigureTest(LISTS_TEST "${LISTS_TEST_SRC}")
diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index 3994ab60a18..8f4a46e2a54 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,7 +29,6 @@
 #include
 #include
 #include
-#include
 
 #include
 #include
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index f2f68c7601b..7c02b4957b5 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Copyright 2018-2019 BlazingDB, Inc.
 * Copyright 2018 Christian Noboa Mardini
@@ -2200,6 +2200,21 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{150, 200}, scale_type{-2}};
+  auto const rhs      = fp_wrapper<RepType>{{2250, 1005}, scale_type{-3}};
+  auto const expected = fp_wrapper<RepType>{{3750, 3005}, scale_type{-3}};
+
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple)
 {
   using namespace numeric;
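For reference, the arithmetic the SimplePlus test above expects (a worked example, not part of the diff): ADD first rescales both operands to the smaller scale.

// lhs rep 150  at scale -2  ==>  150 * 10^-2 = 1.50
// rhs rep 2250 at scale -3  ==> 2250 * 10^-3 = 2.250
// common scale is -3, so the lhs rep 150 becomes 1500;
// 1500 + 2250 = 3750 ==> rep 3750 at scale -3 = 3.750, the first `expected` value
// (the second row works the same way: 2000 + 1005 = 3005 at scale -3).
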
diff --git a/cpp/tests/copying/gather_str_tests.cu b/cpp/tests/copying/gather_str_tests.cu
index 75cea81c950..6655f819190 100644
--- a/cpp/tests/copying/gather_str_tests.cu
+++ b/cpp/tests/copying/gather_str_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 
 class GatherTestStr : public cudf::test::BaseFixture {
 };
diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp
index f010b504436..62415693363 100644
--- a/cpp/tests/copying/sample_tests.cpp
+++ b/cpp/tests/copying/sample_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include
 
 #include
 #include
diff --git a/cpp/tests/copying/scatter_list_tests.cu b/cpp/tests/copying/scatter_list_tests.cu
index e8e11629628..786f1c57b26 100644
--- a/cpp/tests/copying/scatter_list_tests.cu
+++ b/cpp/tests/copying/scatter_list_tests.cu
@@ -733,7 +733,9 @@ TYPED_TEST(TypedScatterListsTest, ListsOfNullStructs)
   };
   // clang-format on
 
-  auto expected_structs = structs_column_wrapper{{expected_numerics, expected_strings}};
+  auto expected_structs =
+    structs_column_wrapper{{expected_numerics, expected_strings},
+                           make_counting_transform_iterator(0, [](auto i) { return i != 6; })};
 
   auto expected_lists = cudf::make_lists_column(
     6, offsets_column{0, 3, 5, 9, 11, 13, 15}.release(), expected_structs.release(), 0, {});
@@ -828,7 +830,9 @@ TYPED_TEST(TypedScatterListsTest, EmptyListsOfStructs)
   };
   // clang-format on
 
-  auto expected_structs = structs_column_wrapper{{expected_numerics, expected_strings}};
+  auto expected_structs =
+    structs_column_wrapper{{expected_numerics, expected_strings},
+                           make_counting_transform_iterator(0, [](auto i) { return i != 6; })};
 
   auto expected_lists = cudf::make_lists_column(
     6, offsets_column{0, 3, 5, 9, 11, 11, 13}.release(), expected_structs.release(), 0, {});
@@ -929,7 +933,9 @@ TYPED_TEST(TypedScatterListsTest, NullListsOfStructs)
   };
   // clang-format on
 
-  auto expected_structs = structs_column_wrapper{{expected_numerics, expected_strings}};
+  auto expected_structs =
+    structs_column_wrapper{{expected_numerics, expected_strings},
+                           make_counting_transform_iterator(0, [](auto i) { return i != 6; })};
 
   auto expected_lists_null_mask_begin =
     make_counting_transform_iterator(0, [](auto i) { return i != 4; });
diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp
index 1f8e7ebe0bf..e9759aa0259 100644
--- a/cpp/tests/copying/slice_tests.cpp
+++ b/cpp/tests/copying/slice_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,7 +16,6 @@
 
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu
index 535cb32defc..5f969098b48 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
@@ -580,4 +581,69 @@ TYPED_TEST(FixedPointTestBothReps, SimpleFixedPointColumnWrapper)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(a, b);
 }
 
+TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
+{
+  // This is testing fixed_point values outside the range of its underlying type.
+  // For example, 100,000,000 with a scale of 6 is 100,000,000,000,000 (100 trillion), which is
+  // outside the range of an int32_t.
+
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<int32_t>;
+
+  auto const a = fp_wrapper{{100000000}, scale_type{6}};
+  auto const b = fp_wrapper{{5000000}, scale_type{7}};
+  auto const c = fp_wrapper{{2}, scale_type{0}};
+
+  auto const expected1 = fp_wrapper{{150000000}, scale_type{6}};
+  auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};
+
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
+}
+
+TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
+{
+  // This is testing fixed_point values outside the range of its underlying type.
+  // For example, 100,000,000 with a scale of 100 is 10 ^ 108, which is far outside the
+  // range of an int64_t.
+
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
+
+  auto const a = fp_wrapper{{100000000}, scale_type{100}};
+  auto const b = fp_wrapper{{5000000}, scale_type{101}};
+  auto const c = fp_wrapper{{2}, scale_type{0}};
+
+  auto const expected1 = fp_wrapper{{150000000}, scale_type{100}};
+  auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};
+
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
+{
+  // This is testing fixed_point values with an extremely large negative scale. The fixed_point
+  // implementation should be able to handle any scale representable by an int32_t.
+
+  using decimalXX  = TypeParam;
+  using RepType    = device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto const a = fp_wrapper{{10}, scale_type{-201}};
+  auto const b = fp_wrapper{{50}, scale_type{-202}};
+  auto const c = fp_wrapper{{2}, scale_type{0}};
+
+  auto const expected1 = fp_wrapper{{150}, scale_type{-202}};
+  auto const expected2 = fp_wrapper{{5}, scale_type{-201}};
+
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()
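A worked version of the overflow scenario these tests guard against (arithmetic only, derived from the test values above):

// rep 100000000 (10^8) at scale 6 represents 10^8 * 10^6 = 10^14, which cannot
// be materialized in an int32_t (max ~2.1 * 10^9). The tests check that the
// binary ops keep working on the stored reps, rescaling by the *difference*
// of scales instead: for a + b, b is rescaled from scale 7 to 6
// (5000000 * 10 = 50000000), and 100000000 + 50000000 = 150000000 at scale 6,
// which fits comfortably in the rep type.
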
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 88b7a4f4bb2..4dae480d39e 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@
 #include
 
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 6a50aed3f7e..886af048aac 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 #include
 
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index d1f799f0d84..a93c3170445 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 949c1bd2597..743634fd6d3 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -881,9 +880,7 @@ TEST_F(ParquetChunkedWriterTest, SingleTable)
   auto filepath = temp_env->get_temp_filepath("ChunkedSingle.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -903,10 +900,7 @@ TEST_F(ParquetChunkedWriterTest, SimpleTable)
   auto filepath = temp_env->get_temp_filepath("ChunkedSimple.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -926,10 +920,7 @@ TEST_F(ParquetChunkedWriterTest, LargeTables)
   auto filepath = temp_env->get_temp_filepath("ChunkedLarge.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  auto md = cudf_io::write_parquet_chunked_end(state);
+  auto md = cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2).close();
   CUDF_EXPECTS(!md, "The return value should be null.");
 
   cudf_io::parquet_reader_options read_opts =
@@ -956,11 +947,11 @@ TEST_F(ParquetChunkedWriterTest, ManyTables)
   auto filepath = temp_env->get_temp_filepath("ChunkedManyTables.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  std::for_each(table_views.begin(), table_views.end(), [&state](table_view const& tbl) {
-    cudf_io::write_parquet_chunked(tbl, state);
+  cudf_io::parquet_chunked_writer writer(args);
+  std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) {
+    writer.write(tbl);
   });
-  auto md = cudf_io::write_parquet_chunked_end(state, true, "dummy/path");
+  auto md = writer.close("dummy/path");
   CUDF_EXPECTS(md, "The returned metadata should not be null.");
 
   cudf_io::parquet_reader_options read_opts =
@@ -991,10 +982,7 @@ TEST_F(ParquetChunkedWriterTest, Strings)
   auto filepath = temp_env->get_temp_filepath("ChunkedStrings.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked(tbl2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl1).write(tbl2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1053,10 +1041,7 @@ TEST_F(ParquetChunkedWriterTest, ListColumn)
   auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl0, state);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl0).write(tbl1);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1074,10 +1059,39 @@ TEST_F(ParquetChunkedWriterTest, MismatchedTypes)
   auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedTypes.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table2, state), cudf::logic_error);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table1);
+  EXPECT_THROW(writer.write(*table2), cudf::logic_error);
+  writer.close();
+}
+
+TEST_F(ParquetChunkedWriterTest, ChunkedWriteAfterClosing)
+{
+  srand(31337);
+  auto table = create_random_fixed_table<int>(4, 4, true);
+
+  auto filepath = temp_env->get_temp_filepath("ChunkedWriteAfterClosing.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table).close();
+  EXPECT_THROW(writer.write(*table), cudf::logic_error);
+}
+
+TEST_F(ParquetChunkedWriterTest, ReadingUnclosedFile)
+{
+  srand(31337);
+  auto table = create_random_fixed_table<int>(4, 4, true);
+
+  auto filepath = temp_env->get_temp_filepath("ReadingUnclosedFile.parquet");
+  cudf_io::chunked_parquet_writer_options args =
+    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table);
+
+  cudf_io::parquet_reader_options read_opts =
+    cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
+  EXPECT_THROW(cudf_io::read_parquet(read_opts), cudf::logic_error);
 }
 
 TEST_F(ParquetChunkedWriterTest, MismatchedStructure)
@@ -1089,10 +1103,10 @@ TEST_F(ParquetChunkedWriterTest, MismatchedStructure)
   auto filepath = temp_env->get_temp_filepath("ChunkedMismatchedStructure.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table2, state), cudf::logic_error);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(*table1);
+  EXPECT_THROW(writer.write(*table2), cudf::logic_error);
+  writer.close();
 }
 
 TEST_F(ParquetChunkedWriterTest, MismatchedStructureList)
@@ -1129,9 +1143,9 @@ TEST_F(ParquetChunkedWriterTest, MismatchedStructureList)
   auto filepath = temp_env->get_temp_filepath("ChunkedLists.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl0, state);
-  CUDF_EXPECT_THROW_MESSAGE(cudf_io::write_parquet_chunked(tbl1, state),
+  cudf_io::parquet_chunked_writer writer(args);
+  writer.write(tbl0);
+  CUDF_EXPECT_THROW_MESSAGE(writer.write(tbl1),
                             "Mismatch in schema between multiple calls to write_chunk");
 }
 
@@ -1146,10 +1160,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullability)
   auto filepath = temp_env->get_temp_filepath("ChunkedNullable.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1179,10 +1190,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullability)
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1238,10 +1246,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityList)
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(table1, state);
-  cudf_io::write_parquet_chunked(table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(table1).write(table2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1258,20 +1263,22 @@ TEST_F(ParquetChunkedWriterTest, WrongNullability)
   auto filepath = temp_env->get_temp_filepath("ChunkedWrongNullable.parquet");
 
   cudf::io::table_metadata_with_nullability nullable_metadata;
+  // The number of columns with masks in the table (i.e. 5) and the size of the
+  // column nullability vector (i.e. 6) are mismatched.
   nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 6, true);
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table1, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(*table1), cudf::logic_error);
 
   nullable_metadata.column_nullable.clear();
+  // The number of columns with masks in the table (i.e. 5) and the size of the
+  // column nullability vector (i.e. 4) are mismatched.
   nullable_metadata.column_nullable.insert(nullable_metadata.column_nullable.begin(), 4, true);
   cudf_io::chunked_parquet_writer_options args2 =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath})
       .nullable_metadata(&nullable_metadata);
-  state = cudf_io::write_parquet_chunked_begin(args2);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(*table1, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args2).write(*table1), cudf::logic_error);
 }
 
@@ -1285,10 +1292,9 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroups)
   auto filepath = temp_env->get_temp_filepath("ChunkedRowGroups.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked(*table2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  {
+    cudf_io::parquet_chunked_writer(args).write(*table1).write(*table2);
+  }
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath})
@@ -1306,9 +1312,7 @@ TEST_F(ParquetChunkedWriterTest, ReadRowGroupsError)
   auto filepath = temp_env->get_temp_filepath("ChunkedRowGroupsError.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(*table1, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(*table1);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}).row_groups({{0, 1}});
@@ -1339,40 +1343,32 @@ TEST_F(ParquetChunkedWriterTest, DecimalWrite)
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
 
   // verify failure if no decimal precision given
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // verify failure if too small a precision is given
   std::vector<uint8_t> precisions{7, 1};
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // verify failure if too few precisions given
   precisions.pop_back();
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // verify success if equal precision is given
   precisions = {7, 9};
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(table, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(table);
 
   // verify failure if too many precisions given
   precisions = {7, 14, 11};
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  EXPECT_THROW(cudf_io::write_parquet_chunked(table, state), cudf::logic_error);
+  EXPECT_THROW(cudf_io::parquet_chunked_writer(args).write(table), cudf::logic_error);
 
   // write correctly
   precisions.pop_back();
   args.set_decimal_precision_data(precisions);
-  state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(table, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(table);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1419,10 +1415,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize)
   auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked(tbl2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl1).write(tbl2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
@@ -1469,10 +1462,7 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2)
   auto filepath = temp_env->get_temp_filepath("ChunkedUnalignedSize2.parquet");
   cudf_io::chunked_parquet_writer_options args =
     cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{filepath});
-  auto state = cudf_io::write_parquet_chunked_begin(args);
-  cudf_io::write_parquet_chunked(tbl1, state);
-  cudf_io::write_parquet_chunked(tbl2, state);
-  cudf_io::write_parquet_chunked_end(state);
+  cudf_io::parquet_chunked_writer(args).write(tbl1).write(tbl2);
 
   cudf_io::parquet_reader_options read_opts =
     cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath});
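The tests above migrate from the begin/write/end free functions to the new chainable writer object. A usage sketch mirroring the test style (function name and tables are hypothetical):

#include <cudf/io/parquet.hpp>

namespace cudf_io = cudf::io;

// Open once, chain any number of writes, then close explicitly; close()
// replaces write_parquet_chunked_end() and can return file metadata.
void write_both(cudf::table_view const& t1, cudf::table_view const& t2, std::string const& path)
{
  cudf_io::chunked_parquet_writer_options args =
    cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info{path});
  cudf_io::parquet_chunked_writer writer(args);
  writer.write(t1).write(t2);
  writer.close();
}
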
diff --git a/cpp/tests/jit/jit-cache-test.hpp b/cpp/tests/jit/jit-cache-test.hpp
index 44736b821ae..261cc0fd3b4 100644
--- a/cpp/tests/jit/jit-cache-test.hpp
+++ b/cpp/tests/jit/jit-cache-test.hpp
@@ -123,10 +123,10 @@ struct JitCacheTest : public ::testing::Test, public cudf::jit::cudfJitCache {
 
 /**
 * @brief Similar to JitCacheTest but it doesn't run warmUp() test in SetUp and
- * purgeFileCache() in TearDown
+ * purgeFileCache() in SetUp and TearDown
 */
 struct JitCacheMultiProcessTest : public JitCacheTest {
-  virtual void SetUp() { purgeFileCache(); }
+  virtual void SetUp() {}
   virtual void TearDown() {}
 };
diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
new file mode 100644
index 00000000000..1885f626490
--- /dev/null
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cudf {
+namespace test {
+
+struct ContainsTest : public BaseFixture {
+};
+
+using ContainsTestTypes = Concat<IntegralTypesNotBool, FloatingPointTypes, ChronoTypes>;
+
+template <typename T>
+struct TypedContainsTest : public ContainsTest {
+};
+
+TYPED_TEST_CASE(TypedContainsTest, ContainsTestTypes);
+
+namespace {
+template <typename T, typename std::enable_if_t<cudf::is_numeric<T>(), void>* = nullptr>
+auto create_scalar_search_key(T const& value)
+{
+  auto search_key = make_numeric_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(true);
+  static_cast<scalar_type_t<T>*>(search_key.get())->set_value(value);
+  return search_key;
+}
+
+template <typename T,
+          typename std::enable_if_t<std::is_same<T, std::string>::value, void>* = nullptr>
+auto create_scalar_search_key(std::string const& value)
+{
+  return make_string_scalar(value);
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_timestamp<T>(), void>* = nullptr>
+auto create_scalar_search_key(typename T::rep const& value)
+{
+  auto search_key = make_timestamp_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(true);
+  static_cast<scalar_type_t<T>*>(search_key.get())->set_value(value);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_duration<T>(), void>* = nullptr>
+auto create_scalar_search_key(typename T::rep const& value)
+{
+  auto search_key = make_duration_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(true);
+  static_cast<scalar_type_t<T>*>(search_key.get())->set_value(value);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_numeric<T>(), void>* = nullptr>
+auto create_null_search_key()
+{
+  auto search_key = make_numeric_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(false);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_timestamp<T>(), void>* = nullptr>
+auto create_null_search_key()
+{
+  auto search_key = make_timestamp_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(false);
+  return search_key;
+}
+
+template <typename T, typename std::enable_if_t<cudf::is_duration<T>(), void>* = nullptr>
+auto create_null_search_key()
+{
+  auto search_key = make_duration_scalar(data_type{type_to_id<T>()});
+  search_key->set_valid(false);
+  return search_key;
+}
+
+}  // namespace
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarWithNoNulls)
+{
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {0, 1, 2},
+    {3, 4, 5},
+    {6, 7, 8},
+    {9, 0, 1},
+    {2, 3, 4},
+    {5, 6, 7},
+    {8, 9, 0},
+    {},
+    {1, 2, 3},
+    {}}.release();
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{1, 0, 0, 1, 0, 0, 0, 0, 1, 0};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarWithNullLists)
+{
+  // Test List columns that have NULL list rows.
+
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {{0, 1, 2},
+     {3, 4, 5},
+     {6, 7, 8},
+     {},
+     {9, 0, 1},
+     {2, 3, 4},
+     {5, 6, 7},
+     {8, 9, 0},
+     {},
+     {1, 2, 3},
+     {}},
+    make_counting_transform_iterator(0, [](auto i) {
+      return (i != 3) && (i != 10);
+    })}.release();
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{
+    {1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0},
+    make_counting_transform_iterator(0, [](auto i) { return (i != 3) && (i != 10); })};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarNonNullListsWithNullValues)
+{
+  // Test List columns that have no NULL list rows, but NULL elements in some list rows.
+  using T = TypeParam;
+
+  auto numerals = fixed_width_column_wrapper<T, int32_t>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto search_space =
+    make_lists_column(8,
+                      fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+                      numerals.release(),
+                      0,
+                      {});
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 1, 0, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ListContainsScalarWithNullsInLists)
+{
+  using T = TypeParam;
+
+  auto numerals = fixed_width_column_wrapper<T, int32_t>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; });
+
+  auto search_space = make_lists_column(
+    8,
+    fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+    numerals.release(),
+    1,
+    cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8));
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TEST_F(ContainsTest, BoolListContainsScalarWithNullsInLists)
+{
+  using T = bool;
+
+  auto numerals = fixed_width_column_wrapper<T, int32_t>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; });
+
+  auto search_space = make_lists_column(
+    8,
+    fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+    numerals.release(),
+    1,
+    cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8));
+
+  auto search_key_one = create_scalar_search_key<T>(1);
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 1, 0, 0, 1, 0, 1}, {0, 1, 1, 1, 0, 1, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TEST_F(ContainsTest, StringListContainsScalarWithNullsInLists)
+{
+  using T = std::string;
+
+  auto strings = strings_column_wrapper{
+    {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "1", "2", "3", "4"},
+    make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })};
+
+  auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; });
+
+  auto search_space = make_lists_column(
+    8,
+    fixed_width_column_wrapper<size_type>{0, 1, 3, 7, 7, 7, 10, 11, 15}.release(),
+    strings.release(),
+    1,
+    cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8));
+
+  auto search_key_one = create_scalar_search_key<T>("1");
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_one);
+
+  auto expected_result =
+    fixed_width_column_wrapper<bool>{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 1, 1}};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedContainsTest, ContainsScalarNullSearchKey)
+{
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {{0, 1, 2},
+     {3, 4, 5},
+     {6, 7, 8},
+     {},
+     {9, 0, 1},
+     {2, 3, 4},
+     {5, 6, 7},
+     {8, 9, 0},
+     {},
+     {1, 2, 3},
+     {}},
+    make_counting_transform_iterator(0, [](auto i) {
+      return (i != 3) && (i != 10);
+    })}.release();
+
+  auto search_key_null = create_null_search_key<T>();
+
+  auto actual_result = lists::contains(search_space->view(), *search_key_null);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{
+    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    make_counting_transform_iterator(0, [](auto i) { return false; })};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TEST_F(ContainsTest, ScalarTypeRelatedExceptions)
+{
+  {
+    // Nested types unsupported.
+    auto list_of_lists = lists_column_wrapper<int32_t>{
+      {{1, 2, 3}, {4, 5, 6}},
+      {{1, 2, 3}, {4, 5, 6}},
+      {{1, 2, 3}, {4, 5, 6}}}.release();
+    auto skey = create_scalar_search_key<int32_t>(10);
+    CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_lists->view(), *skey),
+                              "Nested types not supported in lists::contains()");
+  }
+
+  {
+    // Search key must match list elements in type.
+    auto list_of_ints =
+      lists_column_wrapper<int32_t>{
+        {0, 1, 2},
+        {3, 4, 5},
+      }
+        .release();
+    auto skey = create_scalar_search_key<std::string>("Hello, World!");
+    CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), *skey),
+                              "Type of search key does not match list column element type.");
+  }
+}
+
+template <typename T>
+struct TypedVectorContainsTest : public ContainsTest {
+};
+
+using VectorContainsTestTypes =
+  cudf::test::Concat<IntegralTypesNotBool, FloatingPointTypes, ChronoTypes>;
+
+TYPED_TEST_CASE(TypedVectorContainsTest, VectorContainsTestTypes);
+
+TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNoNulls)
+{
+  using T = TypeParam;
+
+  auto search_space = lists_column_wrapper<T, int32_t>{
+    {0, 1, 2},
+    {3, 4, 5},
+    {6, 7, 8},
+    {9, 0, 1},
+    {2, 3, 4},
+    {5, 6, 7},
+    {8, 9, 0},
+    {},
+    {1, 2, 3},
+    {}}.release();
+
+  auto search_key = fixed_width_column_wrapper<T, int32_t>{1, 2, 3, 1, 2, 3, 1, 2, 3, 1};
+
+  auto actual_result = lists::contains(search_space->view(), search_key);
+
+  auto expected_result = fixed_width_column_wrapper<bool>{1, 0, 0, 1, 1, 0, 0, 0, 1, 0};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result);
+}
+
+TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullLists)
+{
+  // Test List columns that have NULL list rows.
+ + using T = TypeParam; + + auto search_space = lists_column_wrapper{ + {{0, 1, 2}, + {3, 4, 5}, + {6, 7, 8}, + {}, + {9, 0, 1}, + {2, 3, 4}, + {5, 6, 7}, + {8, 9, 0}, + {}, + {1, 2, 3}, + {}}, + make_counting_transform_iterator(0, [](auto i) { + return (i != 3) && (i != 10); + })}.release(); + + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = fixed_width_column_wrapper{ + {1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0}, + make_counting_transform_iterator(0, [](auto i) { return (i != 3) && (i != 10); })}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TYPED_TEST(TypedVectorContainsTest, ListContainsVectorNonNullListsWithNullValues) +{ + // Test List columns that have no NULL list rows, but NULL elements in some list rows. + using T = TypeParam; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto search_space = + make_lists_column(8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 0, + {}); + + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 3}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 1, 1, 0, 1, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullsInLists) +{ + using T = TypeParam; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 3, 1, 3}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 1, 0, 0, 1, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNullsInListsAndInSearchKeys) +{ + using T = TypeParam; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = fixed_width_column_wrapper{ + {1, 2, 3, 1, 2, 3, 1, 3}, make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} 
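
[Editor's note: the scalar- and vector-key tests above all drive the same entry point, `cudf::lists::contains`. For orientation, a minimal sketch of calling the C++ API outside the test harness follows; the header paths and the pre-built `lists` column are assumptions, and the comments restate the null semantics the tests verify.]

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/lists/contains.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>

// `lists` is assumed to be an existing LIST<INT32> column view built elsewhere.
std::unique_ptr<cudf::column> contains_one(cudf::column_view const& lists)
{
  cudf::numeric_scalar<int32_t> key{1};  // the scalar search key
  // Result is a BOOL8 column with one row per list row. Per the tests above, a
  // result row is null when the list row is null, or when the key is absent
  // and the list row contains at least one null element.
  return cudf::lists::contains(cudf::lists_column_view{lists}, key);
}
```

The vector form takes a `column_view` of keys, one per list row, in place of the scalar.
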
+ +TEST_F(ContainsTest, BoolListContainsVectorWithNullsInListsAndInSearchKeys) +{ + using T = bool; + + auto numerals = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = fixed_width_column_wrapper{ + {0, 1, 0, 1, 0, 0, 1, 1}, make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TEST_F(ContainsTest, StringListContainsVectorWithNullsInListsAndInSearchKeys) +{ + auto numerals = strings_column_wrapper{ + {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "1", "2", "3", "4"}, + make_counting_transform_iterator(0, [](auto i) -> bool { return i % 3; })}; + + auto input_null_mask_iter = make_counting_transform_iterator(0, [](auto i) { return i != 4; }); + + auto search_space = make_lists_column( + 8, + fixed_width_column_wrapper{0, 1, 3, 7, 7, 7, 10, 12, 15}.release(), + numerals.release(), + 1, + cudf::test::detail::make_null_mask(input_null_mask_iter, input_null_mask_iter + 8)); + + auto search_keys = + strings_column_wrapper{{"1", "2", "3", "1", "2", "3", "1", "3"}, + make_counting_transform_iterator(0, [](auto i) { return i != 6; })}; + + auto actual_result = lists::contains(search_space->view(), search_keys); + + auto expected_result = + fixed_width_column_wrapper{{0, 1, 0, 0, 0, 0, 0, 1}, {0, 1, 0, 1, 0, 0, 0, 1}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result, *actual_result); +} + +TEST_F(ContainsTest, VectorTypeRelatedExceptions) +{ + { + // Nested types unsupported. + auto list_of_lists = lists_column_wrapper{ + {{1, 2, 3}, {4, 5, 6}}, + {{1, 2, 3}, {4, 5, 6}}, + {{1, 2, 3}, + {4, 5, 6}}}.release(); + auto skey = fixed_width_column_wrapper{0, 1, 2}; + CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_lists->view(), skey), + "Nested types not supported in lists::contains()"); + } + + { + // Search key must match list elements in type. + auto list_of_ints = + lists_column_wrapper{ + {0, 1, 2}, + {3, 4, 5}, + } + .release(); + auto skey = strings_column_wrapper{"Hello", "World"}; + CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), skey), + "Type of search key does not match list column element type."); + } + + { + // Search key column size must match lists column size. + auto list_of_ints = lists_column_wrapper{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}.release(); + + auto skey = fixed_width_column_wrapper{0, 1, 2, 3}; + CUDF_EXPECT_THROW_MESSAGE(lists::contains(list_of_ints->view(), skey), + "Number of search keys must match list column size."); + } +} + +} // namespace test + +} // namespace cudf diff --git a/cpp/tests/lists/count_elements_tests.cpp b/cpp/tests/lists/count_elements_tests.cpp new file mode 100644 index 00000000000..c5cb9d230c3 --- /dev/null +++ b/cpp/tests/lists/count_elements_tests.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +struct ListsElementsTest : public cudf::test::BaseFixture { +}; + +using NumericTypesNotBool = + cudf::test::Concat; + +template +class ListsElementsNumericsTest : public ListsElementsTest { +}; + +TYPED_TEST_CASE(ListsElementsNumericsTest, NumericTypesNotBool); + +TYPED_TEST(ListsElementsNumericsTest, CountElements) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}}, validity); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(input)); + cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsStrings) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}}, + validity); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(input)); + cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsSliced) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}}, + validity); + + auto sliced = cudf::slice(input, {1, 4}).front(); + auto result = cudf::lists::count_elements(cudf::lists_column_view(sliced)); + cudf::test::fixed_width_column_wrapper expected({0, 4, 2}, {0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TYPED_TEST(ListsElementsNumericsTest, CountElementsNestedLists) +{ + std::vector validity{1, 0, 1, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW list({LCW{LCW{2, 3}, LCW{4, 5}}, + LCW{LCW{}}, + LCW{LCW{6, 7, 8}, LCW{9, 10, 11}, LCW({12, 13, 14}, validity.begin())}, + LCW{LCW{15, 16}, LCW{17, 18}, LCW{19, 20}, LCW{21, 22}, LCW{23, 24}}}, + validity.begin()); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(list)); + cudf::test::fixed_width_column_wrapper expected({2, 1, 3, 5}, {1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsEmpty) +{ + using LCW = cudf::test::lists_column_wrapper; + + LCW empty{}; + auto result = cudf::lists::count_elements(cudf::lists_column_view(empty)); + EXPECT_EQ(0, result->size()); +} diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index 2df3ce9e021..48ee77d565f 
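
[Editor's note: the count_elements tests above likewise reduce to a single call. A minimal sketch under the same assumptions (header path per cudf conventions; `input` built elsewhere):]

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>

// `input` is assumed to be an existing LIST column view. The result is an
// INT32 column holding each row's element count; null list rows produce null
// output rows, as the tests above verify.
std::unique_ptr<cudf::column> list_lengths(cudf::column_view const& input)
{
  return cudf::lists::count_elements(cudf::lists_column_view{input});
}
```
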
100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ b/cpp/tests/reshape/byte_cast_tests.cpp @@ -114,7 +114,7 @@ TEST_F(ByteCastTest, int32ValuesWithNulls) 5, std::move(fixed_width_column_wrapper{0, 4, 8, 12, 16, 20}.release()), std::move(int32_data.release()), - 3, + 2, detail::make_null_mask(even_validity, even_validity + 5)); auto const output_int32 = cudf::byte_cast(int32_col, cudf::flip_endianness::YES); diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp new file mode 100644 index 00000000000..6f98332243e --- /dev/null +++ b/cpp/tests/reshape/explode_tests.cpp @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +using namespace cudf::test; + +class ExplodeTest : public cudf::test::BaseFixture { +}; + +template +class ExplodeTypedTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); + +TEST_F(ExplodeTest, Empty) +{ + lists_column_wrapper a{}; + fixed_width_column_wrapper b{}; + + cudf::table_view t({a, b}); + + auto ret = cudf::explode(t, 0); + + fixed_width_column_wrapper expected_a{}; + fixed_width_column_wrapper expected_b{}; + cudf::table_view expected({expected_a, expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NonList) +{ + fixed_width_column_wrapper a{100, 200, 300}; + fixed_width_column_wrapper b{100, 200, 300}; + + cudf::table_view t({a, b}); + + EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeTest, Basics) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [0, 3] 300 + */ + + fixed_width_column_wrapper a{100, 200, 300}; + lists_column_wrapper b{lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + fixed_width_column_wrapper expected_a{100, 100, 100, 200, 200, 300, 300}; + fixed_width_column_wrapper expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode(t, 1); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, SingleNull) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [] 300 + [0, 3] 400 + */ + + auto first_invalid = + cudf::test::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? 
false : true; }); + + lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{}, + lists_column_wrapper{0, 3}}, + first_invalid); + fixed_width_column_wrapper b({100, 200, 300, 400}); + + fixed_width_column_wrapper expected_a{5, 6, 0, 3}; + fixed_width_column_wrapper expected_b{200, 200, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, Nulls) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [0, 3] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); + + lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}}, + valids); + fixed_width_column_wrapper b({100, 200, 300}, valids); + + fixed_width_column_wrapper expected_a({1, 2, 7, 0, 3}); + fixed_width_column_wrapper expected_b({100, 100, 100, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInList) +{ + /* + a b + [1, 2, 7] 100 + [5, 6, 0, 9] 200 + [] 300 + [0, 3, 8] 400 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), + lists_column_wrapper({5, 6, 0, 9}, valids), + lists_column_wrapper{}, + lists_column_wrapper({0, 3, 8}, valids)}; + fixed_width_column_wrapper b{100, 200, 300, 400}; + + fixed_width_column_wrapper expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, + {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, Nested) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[],[5],[2, 1]] 300 + */ + + lists_column_wrapper a{ + lists_column_wrapper{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}}; + fixed_width_column_wrapper b{100, 200, 300}; + + lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}; + fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NestedNulls) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}}, + valids); + fixed_width_column_wrapper b({100, 200, 300}, valids); + + lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}; + fixed_width_column_wrapper expected_b({100, 100, 300, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInNested) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}); + fixed_width_column_wrapper b({100, 200, 300}); + + lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}; + fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInNestedDoubleExplode) +{ + /* + a b + [[1, 2], [], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a{ + lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}; + fixed_width_column_wrapper b{100, 200, 300}; + + fixed_width_column_wrapper expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + fixed_width_column_wrapper expected_b{ + 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + ret = cudf::explode(ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NestedStructs) +{ + /* + a b + [[1, 2], [7, 6, 5]] {100, "100"} + [[5, 6]] {200, "200"} + [[0, 3],[5],[2, 1]] {300, "300"} + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}); + fixed_width_column_wrapper b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}; + fixed_width_column_wrapper expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TYPED_TEST(ExplodeTypedTest, ListOfStructs) +{ + /* + a b + [{70, "70"}, {75, "75"}] 100 + [{50, "50"}, {55, "55"}] 200 + [{35, "35"}, {45, "45"}] 300 + [{25, "25"}, {30, "30"}] 400 + [{15, "15"}, {20, "20"}] 500 +*/ + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), + std::move(struct_col), + cudf::UNKNOWN_NULL_COUNT, + {}); + + fixed_width_column_wrapper b{100, 200, 300, 400, 500}; + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + fixed_width_column_wrapper expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, SlicedList) +{ + /* + a b + [[1, 2],[7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + [[8, 3],[],[4, 3, 1, 2]] 400 + [[2, 3, 4],[9, 8]] 500 + + slicing the top 2 rows and the bottom row off + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? 
true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}, + lists_column_wrapper{lists_column_wrapper{8, 3}, + lists_column_wrapper{}, + lists_column_wrapper({4, 3, 1, 2}, valids)}, + lists_column_wrapper{lists_column_wrapper{2, 3, 4}, + lists_column_wrapper{9, 8}}}); + fixed_width_column_wrapper b({100, 200, 300, 400, 500}); + + lists_column_wrapper expected_a{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids), + lists_column_wrapper{8, 3}, + lists_column_wrapper{}, + lists_column_wrapper({4, 3, 1, 2}, valids)}; + fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index c6565ac72dc..c501071ccbe 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 066a6624fb7..ad693f96c4d 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -249,7 +249,7 @@ TEST_F(ApplyBooleanMask, NoNullInput) TEST_F(ApplyBooleanMask, CorrectNullCount) { - cudf::size_type inputRows = 75000; + cudf::size_type inputRows = 471234; auto seq1 = cudf::test::make_counting_transform_iterator(0, [](auto i) { return i; }); auto valid_seq1 = cudf::test::make_counting_transform_iterator(0, [](auto row) { return true; }); diff --git a/cpp/tests/strings/attrs_tests.cpp b/cpp/tests/strings/attrs_tests.cpp index 396895dc055..117a215374a 100644 --- a/cpp/tests/strings/attrs_tests.cpp +++ b/cpp/tests/strings/attrs_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp index c01b220d9da..97b1dd716d7 100644 --- a/cpp/tests/strings/find_tests.cpp +++ b/cpp/tests/strings/find_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
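
[Editor's note: every explode test above exercises one API call. A minimal sketch of that call, with the declaration's header location an assumption; the behavior in the comments restates what the tests check.]

```cpp
#include <cudf/lists/explode.hpp>  // assumed location of the cudf::explode declaration
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Explodes list column `idx` of `input`: each list element becomes its own
// row, and the values of the other columns are repeated. Per the tests above,
// null list rows are dropped entirely, while nulls inside a list survive as
// null rows in the output.
std::unique_ptr<cudf::table> explode_column(cudf::table_view const& input, cudf::size_type idx)
{
  return cudf::explode(input, idx);
}
```
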
@@ -23,7 +23,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index 3a3613cc35d..40775382e16 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp index c604e83f05d..3c970a5d1f1 100644 --- a/cpp/tests/table/row_operators_tests.cpp +++ b/cpp/tests/table/row_operators_tests.cpp @@ -65,3 +65,25 @@ TEST_F(RowOperatorTestForNAN, NANSorting) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, got2->view()); } + +TEST_F(RowOperatorTestForNAN, NANSortingNonNull) +{ + cudf::test::fixed_width_column_wrapper input{ + {0., + double(NAN), + -1., + 7., + std::numeric_limits::infinity(), + 1., + -1 * std::numeric_limits::infinity()}}; + + cudf::table_view input_table{{input}}; + + auto result = cudf::sorted_order(input_table, {cudf::order::ASCENDING}); + cudf::test::fixed_width_column_wrapper expected_asc{{6, 2, 0, 5, 3, 4, 1}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_asc, result->view()); + + result = cudf::sorted_order(input_table, {cudf::order::DESCENDING}); + cudf::test::fixed_width_column_wrapper expected_desc{{1, 4, 3, 5, 0, 2, 6}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_desc, result->view()); +} diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 5e49e2e854a..20d1c5df5ea 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include #include -#include struct MaskToNullTest : public cudf::test::BaseFixture { void run_test(std::vector input, std::vector val) diff --git a/cpp/tests/transform/mask_to_bools_test.cpp b/cpp/tests/transform/mask_to_bools_test.cpp index 2b0325336e1..2a759ffcfe5 100644 --- a/cpp/tests/transform/mask_to_bools_test.cpp +++ b/cpp/tests/transform/mask_to_bools_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ #include #include #include -#include struct MaskToBools : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 62f31233c80..4f7ac41a00f 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -67,6 +67,8 @@ struct column_property_comparator { if (lhs.size() > 0 && check_exact_equality) { EXPECT_EQ(lhs.nullable(), rhs.nullable()); } + EXPECT_EQ(lhs.null_count(), rhs.null_count()); + // equivalent, but not exactly equal columns can have a different number of children if their // sizes are both 0. Specifically, empty string columns may or may not have children. 
if (check_exact_equality || lhs.size() > 0) { diff --git a/cpp/tests/utilities/scalar_utilities.cu b/cpp/tests/utilities/scalar_utilities.cu deleted file mode 100644 index 6149356e2f3..00000000000 --- a/cpp/tests/utilities/scalar_utilities.cu +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include "gtest/gtest.h" - -using cudf::scalar_type_t; - -namespace cudf { -namespace test { -namespace { -struct compare_scalar_functor { - template - void operator()(cudf::scalar const& lhs, cudf::scalar const& rhs) - { - auto lhs_t = static_cast const&>(lhs); - auto rhs_t = static_cast const&>(rhs); - EXPECT_EQ(lhs_t.value(), rhs_t.value()); - } -}; - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, cudf::scalar const& rhs) -{ - auto lhs_t = static_cast const&>(lhs); - auto rhs_t = static_cast const&>(rhs); - EXPECT_FLOAT_EQ(lhs_t.value(), rhs_t.value()); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, cudf::scalar const& rhs) -{ - auto lhs_t = static_cast const&>(lhs); - auto rhs_t = static_cast const&>(rhs); - EXPECT_DOUBLE_EQ(lhs_t.value(), rhs_t.value()); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, - cudf::scalar const& rhs) -{ - CUDF_FAIL("Unsupported scalar compare type: dictionary"); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, - cudf::scalar const& rhs) -{ - CUDF_FAIL("Unsupported scalar compare type: list_view"); -} - -template <> -void compare_scalar_functor::operator()(cudf::scalar const& lhs, - cudf::scalar const& rhs) -{ - CUDF_FAIL("Unsupported scalar compare type: struct_view"); -} - -} // anonymous namespace - -void expect_scalars_equal(cudf::scalar const& lhs, cudf::scalar const& rhs) -{ - EXPECT_EQ(lhs.type(), rhs.type()); - EXPECT_EQ(lhs.is_valid(), rhs.is_valid()); - - if (lhs.is_valid() && rhs.is_valid() && lhs.type() == rhs.type()) { - type_dispatcher(lhs.type(), compare_scalar_functor{}, lhs, rhs); - } -} - -} // namespace test -} // namespace cudf diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 09cfcabdcdb..8daf3a0850e 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -21,6 +21,7 @@ # import os import sys +from recommonmark.transform import AutoStructify sys.path.insert(0, os.path.abspath("../..")) @@ -200,8 +201,15 @@ autoclass_content = "init" +# Config AutoStructify +github_doc_root = 'https://github.com/rtfd/recommonmark/tree/master/doc/' def setup(app): app.add_js_file("copybutton_pydocs.js") app.add_css_file("params.css") app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + 'auto_toc_tree_section': 'Contents', + }, True) + app.add_transform(AutoStructify) diff --git 
a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md index 4d775e3d51a..7e96d4fe38c 100644 --- a/docs/cudf/source/groupby.md +++ b/docs/cudf/source/groupby.md @@ -33,6 +33,37 @@ import cudf >>> gb3 = df.groupby(cudf.Series(['a', 'a', 'b', 'b', 'b'])) # grouping by an external column ``` +``` warning:: + cuDF uses `sort=False` by default to achieve better performance, which provides no guarantee of the group order in the output. This deviates from Pandas' default behavior. + + For example: + + .. code-block:: python + + >>> df = cudf.DataFrame({'a' : [2, 2, 1], 'b' : [42, 21, 11]}) + >>> df.groupby('a').sum() + b + a + 2 63 + 1 11 + >>> df.to_pandas().groupby('a').sum() + b + a + 1 11 + 2 63 + + Setting `sort=True` will produce Pandas-like output, but with some performance penalty: + + .. code-block:: python + + >>> df.groupby('a', sort=True).sum() + b + a + 1 11 + 2 63 + +``` + ### Grouping by index levels You can also group by one or more levels of a MultiIndex: @@ -66,7 +97,7 @@ b Aggregations on groups are supported via the `agg` method: -``` +```python >>> df a b c 0 1 1 1 diff --git a/java/pom.xml b/java/pom.xml index ddd0d06a74f..387ef1cb65b 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -132,6 +132,12 @@ 2.25.0 test + + org.apache.arrow + arrow-vector + ${arrow.version} + test + @@ -151,6 +157,7 @@ ALL ${project.build.directory}/cmake-build 1.7.30 + 0.15.1 diff --git a/java/src/main/java/ai/rapids/cudf/ArrowColumnBuilder.java b/java/src/main/java/ai/rapids/cudf/ArrowColumnBuilder.java new file mode 100644 index 00000000000..b3c97930d2a --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/ArrowColumnBuilder.java @@ -0,0 +1,113 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.nio.ByteBuffer; +import java.util.ArrayList; + +/** + * Column builder from Arrow data. This builder takes in byte buffers referencing + * Arrow data and allows efficient building of CUDF ColumnVectors from that Arrow data. + * The caller can add multiple batches where each batch corresponds to Arrow data + * and those batches get concatenated together after being converted to CUDF + * ColumnVectors. + * This currently only supports primitive types and Strings; Decimals and nested types + * such as list and struct are not supported. + */ +public final class ArrowColumnBuilder implements AutoCloseable { + private DType type; + private final ArrayList data = new ArrayList<>(); + private final ArrayList validity = new ArrayList<>(); + private final ArrayList offsets = new ArrayList<>(); + private final ArrayList nullCount = new ArrayList<>(); + private final ArrayList rows = new ArrayList<>(); + + public ArrowColumnBuilder(HostColumnVector.DataType type) { + this.type = type.getType(); + } + + /** + * Add an Arrow buffer. This API allows you to add multiple batches if you want them + * combined into a single ColumnVector.
+ * Note, this takes all data, validity, and offsets buffers, but they may not all + * be needed based on the data type. A buffer should be null if it's not used + * for that type. + * This API only supports primitive types and Strings; Decimals and nested types + * such as list and struct are not supported. + * @param rows - number of rows in this Arrow buffer + * @param nullCount - number of null values in this Arrow buffer + * @param data - ByteBuffer of the Arrow data buffer + * @param validity - ByteBuffer of the Arrow validity buffer + * @param offsets - ByteBuffer of the Arrow offsets buffer + */ + public void addBatch(long rows, long nullCount, ByteBuffer data, ByteBuffer validity, + ByteBuffer offsets) { + this.rows.add(rows); + this.nullCount.add(nullCount); + this.data.add(data); + this.validity.add(validity); + this.offsets.add(offsets); + } + + /** + * Create the immutable ColumnVector, copied to the device based on the Arrow data. + * @return - new ColumnVector + */ + public final ColumnVector buildAndPutOnDevice() { + int numBatches = rows.size(); + ArrayList allVecs = new ArrayList<>(numBatches); + ColumnVector vecRet; + try { + for (int i = 0; i < numBatches; i++) { + allVecs.add(ColumnVector.fromArrow(type, rows.get(i), nullCount.get(i), + data.get(i), validity.get(i), offsets.get(i))); + } + if (numBatches == 1) { + vecRet = allVecs.get(0); + } else if (numBatches > 1) { + vecRet = ColumnVector.concatenate(allVecs.toArray(new ColumnVector[0])); + } else { + throw new IllegalStateException("Can't build a ColumnVector when no Arrow batches specified"); + } + } finally { + // close the vectors that were concatenated + if (numBatches > 1) { + allVecs.forEach(cv -> cv.close()); + } + } + return vecRet; + } + + @Override + public void close() { + // memory buffers owned outside of this + } + + @Override + public String toString() { + return "ArrowColumnBuilder{" + + "type=" + type + + ", data=" + data + + ", validity=" + validity + + ", offsets=" + offsets + + ", nullCount=" + nullCount + + ", rows=" + rows + + '}'; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 88c024a437b..252f869a049 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -25,6 +25,7 @@ import java.math.BigDecimal; import java.math.RoundingMode; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.Optional; @@ -310,6 +311,50 @@ public BaseDeviceMemoryBuffer getDeviceBufferFor(BufferType type) { return srcBuffer; } + /** + * Ensures the ByteBuffer passed in is a direct byte buffer. + * If it is not, then it creates one, copies the data from the + * byte buffer passed in into the direct byte buffer it created, + * and returns it. + */ + private static ByteBuffer bufferAsDirect(ByteBuffer buf) { + ByteBuffer bufferOut = buf; + if (bufferOut != null && !bufferOut.isDirect()) { + bufferOut = ByteBuffer.allocateDirect(buf.remaining()); + bufferOut.put(buf); + bufferOut.flip(); + } + return bufferOut; + } + + /** + * Create a ColumnVector from the Apache Arrow byte buffers passed in. + * Any of the buffers not used for that datatype should be set to null. + * The buffers are expected to be off heap buffers, but if they are not, + * it will handle copying them to direct byte buffers. + * This only supports primitive types and Strings; Decimals and nested types + * such as list and struct are not supported.
+ * @param type - type of the column + * @param numRows - Number of rows in the arrow column + * @param nullCount - Null count + * @param data - ByteBuffer of the Arrow data buffer + * @param validity - ByteBuffer of the Arrow validity buffer + * @param offsets - ByteBuffer of the Arrow offsets buffer + * @return - new ColumnVector + */ + public static ColumnVector fromArrow( + DType type, + long numRows, + long nullCount, + ByteBuffer data, + ByteBuffer validity, + ByteBuffer offsets) { + long columnHandle = fromArrow(type.typeId.getNativeId(), numRows, nullCount, + bufferAsDirect(data), bufferAsDirect(validity), bufferAsDirect(offsets)); + ColumnVector vec = new ColumnVector(columnHandle); + return vec; + } + /** * Create a new vector of length rows, where each row is filled with the Scalar's * value @@ -615,6 +660,10 @@ public ColumnVector castTo(DType type) { private static native long sequence(long initialValue, long step, int rows); + private static native long fromArrow(int type, long col_length, + long null_count, ByteBuffer data, ByteBuffer validity, + ByteBuffer offsets) throws CudfException; + private static native long fromScalar(long scalarHandle, int rowCount) throws CudfException; private static native long makeList(long[] handles, long typeHandle, int scale, long rows) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index c2110a5f8ff..1dce52f7105 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2323,6 +2323,37 @@ public static ColumnView makeStructView(ColumnView... columns) { return makeStructView(columns[0].rows, columns); } + /** + * Create a column of bool values indicating whether the specified scalar + * is an element of each row of a list column. + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The key is null + * 2. The column vector list value is null + * 3. The list row does not contain the key, and contains at least + * one null. + * @param key the scalar to look up + * @return a Boolean ColumnVector with the result of the lookup + */ + public final ColumnVector listContains(Scalar key) { + assert type.equals(DType.LIST) : "column type must be a LIST"; + return new ColumnVector(listContains(getNativeView(), key.getScalarHandle())); + } + + /** + * Create a column of bool values indicating whether the list rows of the first + * column contain the corresponding values in the second column. + * Output `column[i]` is set to null if one or more of the following are true: + * 1. The key value is null + * 2. The column vector list value is null + * 3. The list row does not contain the key, and contains at least + * one null.
+ * @param key the ColumnVector with look up values + * @return a Boolean ColumnVector with the result of the lookup + */ + public final ColumnVector listContainsColumn(ColumnView key) { + assert type.equals(DType.LIST) : "column type must be a LIST"; + return new ColumnVector(listContainsColumn(getNativeView(), key.getNativeView())); + } + ///////////////////////////////////////////////////////////////////////////// // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// @@ -2558,6 +2589,22 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat private static native long extractListElement(long nativeView, int index); + /** + * Native method for list lookup + * @param nativeView the column view handle of the list + * @param key the scalar key handle + * @return column handle of the resultant + */ + private static native long listContains(long nativeView, long key); + + /** + * Native method for list lookup + * @param nativeView the column view handle of the list + * @param keyColumn the column handle of look up keys + * @return column handle of the resultant + */ + private static native long listContainsColumn(long nativeView, long keyColumn); + private static native long castTo(long nativeHandle, int type, int scale); private static native long logicalCastTo(long nativeHandle, int type, int scale); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 14748db872d..da4c446d9f7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -501,6 +501,8 @@ private static native long[] repeatColumnCount(long tableHandle, long columnHandle, boolean checkCount); + private static native long[] explode(long tableHandle, int index); + private native long createCudfTableView(long[] nativeColumnViewHandles); ///////////////////////////////////////////////////////////////////////////// @@ -1615,6 +1617,47 @@ public ContiguousTable[] contiguousSplit(int... indices) { return contiguousSplit(nativeHandle, indices); } + /** + * Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded + * into new rows in the output. The corresponding rows for other columns in the input + * are duplicated. + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300] + * + * + * Nulls propagate in different ways depending on what is null. + * + * [[5,null,15], 100], + * [null, 200] + * returns: + * [5, 100], + * [null, 100], + * [15, 100] + * + * Note that null lists are completely removed from the output + * and nulls inside lists are pulled out and remain. + * + * @param index Column index to explode inside the table. + * @return A new table with explode_col exploded. 
+ */ + public Table explode(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explode(nativeHandle, index)); + } /** * Gathers the rows of this table according to `gatherMap` such that row "i" @@ -2658,11 +2701,15 @@ private static ColumnVector from(DType type, Object dataArray) { } @SuppressWarnings("unchecked") - private static ColumnVector fromLists(DataType dataType, Object[][] dataArray) { + private static ColumnVector fromLists(DataType dataType, Object[] dataArray) { List[] dataLists = new List[dataArray.length]; for (int i = 0; i < dataLists.length; ++i) { - Object[] dataList = dataArray[i]; - dataLists[i] = dataList != null ? Arrays.asList(dataList) : null; + // The element in dataArray can be an array or list, because the below overloaded + // version accepts a List of Array as rows. + // `public TestBuilder column(ListType dataType, List... values)` + Object dataList = dataArray[i]; + dataLists[i] = dataList == null ? null : + (dataList instanceof List ? (List)dataList : Arrays.asList((Object[])dataList)); } return ColumnVector.fromLists(dataType, dataLists); } @@ -2680,7 +2727,7 @@ public Table build() { Object dataArray = typeErasedData.get(i); if (dtype.isNestedType()) { if (dtype.equals(DType.LIST)) { - columns.add(fromLists(dataType, (Object[][]) dataArray)); + columns.add(fromLists(dataType, (Object[]) dataArray)); } else if (dtype.equals(DType.STRUCT)) { columns.add(fromStructs(dataType, (StructData[]) dataArray)); } else { diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 3bce4912fa4..a1e8517c646 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -14,12 +14,15 @@ * limitations under the License. 
*/ +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -50,6 +53,78 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(JNIEnv *env, j CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env, jclass, + jint j_type, + jlong j_col_length, + jlong j_null_count, + jobject j_data_obj, + jobject j_validity_obj, + jobject j_offsets_obj) { + try { + cudf::jni::auto_set_device(env); + cudf::type_id n_type = static_cast(j_type); + // not all the buffers are used for all types + void const *data_address = 0; + int data_length = 0; + if (j_data_obj != 0) { + data_address = env->GetDirectBufferAddress(j_data_obj); + data_length = env->GetDirectBufferCapacity(j_data_obj); + } + void const *validity_address = 0; + int validity_length = 0; + if (j_validity_obj != 0) { + validity_address = env->GetDirectBufferAddress(j_validity_obj); + validity_length = env->GetDirectBufferCapacity(j_validity_obj); + } + void const *offsets_address = 0; + int offsets_length = 0; + if (j_offsets_obj != 0) { + offsets_address = env->GetDirectBufferAddress(j_offsets_obj); + offsets_length = env->GetDirectBufferCapacity(j_offsets_obj); + } + auto data_buffer = arrow::Buffer::Wrap(static_cast(data_address), static_cast(data_length)); + auto null_buffer = arrow::Buffer::Wrap(static_cast(validity_address), static_cast(validity_length)); + auto offsets_buffer = arrow::Buffer::Wrap(static_cast(offsets_address), static_cast(offsets_length)); + + cudf::jni::native_jlongArray outcol_handles(env, 1); + std::shared_ptr arrow_array; + switch (n_type) { + case cudf::type_id::DECIMAL32: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL32 yet", 0); + break; + case cudf::type_id::DECIMAL64: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL64 yet", 0); + break; + case cudf::type_id::STRUCT: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting STRUCT yet", 0); + break; + case cudf::type_id::LIST: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting LIST yet", 0); + break; + case cudf::type_id::DICTIONARY32: + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DICTIONARY32 yet", 0); + break; + case cudf::type_id::STRING: + arrow_array = std::make_shared(j_col_length, offsets_buffer, data_buffer, null_buffer, j_null_count); + break; + default: + // this handles the primitive types + arrow_array = cudf::detail::to_arrow_array(n_type, j_col_length, data_buffer, null_buffer, j_null_count); + } + auto name_and_type = arrow::field("col", arrow_array->type()); + std::vector> fields = {name_and_type}; + std::shared_ptr schema = std::make_shared(fields); + auto arrow_table = arrow::Table::Make(schema, std::vector>{arrow_array}); + std::unique_ptr table_result = cudf::from_arrow(*(arrow_table)); + std::vector> retCols = table_result->release(); + if (retCols.size() != 1) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Must result in one column", 0); + } + return reinterpret_cast(retCols[0].release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, jobject j_object, jlongArray handles, jlong j_type, diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 621344ac38f..82e71b04a2f 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ 
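
[Editor's note: the JNI path above funnels into `cudf::from_arrow`. A minimal host-side sketch of that interop call in isolation follows; it assumes a libcudf build with Arrow enabled and the `cudf/interop.hpp` header location, and omits Arrow builder status checks.]

```cpp
#include <arrow/api.h>
#include <cudf/interop.hpp>  // assumed location of cudf::from_arrow
#include <cudf/table/table.hpp>

std::unique_ptr<cudf::table> arrow_ints_to_cudf()
{
  // Build a small host-side Arrow array (builder status checks omitted).
  arrow::Int32Builder builder;
  (void)builder.AppendValues({1, 2, 3, 4});
  std::shared_ptr<arrow::Array> array;
  (void)builder.Finish(&array);

  auto schema      = arrow::schema({arrow::field("col", arrow::int32())});
  auto arrow_table = arrow::Table::Make(schema, {array});

  // Copy the host Arrow data into a device-backed cudf::table, just as the
  // JNI fromArrow implementation does after wrapping the direct ByteBuffers.
  return cudf::from_arrow(*arrow_table);
}
```
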
b/java/src/main/native/src/ColumnViewJni.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -329,6 +330,40 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractListElement(JNIEnv CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContains(JNIEnv *env, jclass, + jlong column_view, + jlong lookup_key) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, lookup_key, "lookup scalar is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *cv = reinterpret_cast(column_view); + cudf::lists_column_view lcv(*cv); + cudf::scalar *lookup_scalar = reinterpret_cast(lookup_key); + + std::unique_ptr ret = cudf::lists::contains(lcv, *lookup_scalar); + return reinterpret_cast(ret.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv *env, jclass, + jlong column_view, + jlong lookup_key_cv) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, lookup_key_cv, "lookup column is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *cv = reinterpret_cast(column_view); + cudf::lists_column_view lcv(*cv); + cudf::column_view *lookup_cv = reinterpret_cast(lookup_key_cv); + + std::unique_ptr ret = cudf::lists::contains(lcv, *lookup_cv); + return reinterpret_cast(ret.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong column_view, jlong delimiter) { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 32f602ffe85..20afe12baf9 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -186,18 +186,19 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long alloc_size = MINIMUM_WRITE_BUFFER_SIZE; }; -template class jni_table_writer_handle final { +template class jni_table_writer_handle final { public: - explicit jni_table_writer_handle(std::shared_ptr &state) : state(state), sink() {} - jni_table_writer_handle(std::shared_ptr &state, - std::unique_ptr &sink) - : state(state), sink(std::move(sink)) {} + explicit jni_table_writer_handle(std::unique_ptr writer) + : writer(std::move(writer)), sink() {} + jni_table_writer_handle(std::unique_ptr writer, + std::unique_ptr sink) + : writer(std::move(writer)), sink(std::move(sink)) {} - std::shared_ptr state; + std::unique_ptr writer; std::unique_ptr sink; }; -typedef jni_table_writer_handle native_parquet_writer_handle; +typedef jni_table_writer_handle native_parquet_writer_handle; typedef jni_table_writer_handle native_orc_writer_handle; class native_arrow_ipc_writer_handle final { @@ -871,9 +872,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( .decimal_precision(v_precisions) .build(); - std::shared_ptr state = write_parquet_chunked_begin(opts); + auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = - new cudf::jni::native_parquet_writer_handle(state, data_sink); + new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr), std::move(data_sink)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -919,9 +920,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( .decimal_precision(v_precisions) .build(); - std::shared_ptr state = write_parquet_chunked_begin(opts); + auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = 
- new cudf::jni::native_parquet_writer_handle(state); + new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -944,7 +945,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, } try { cudf::jni::auto_set_device(env); - write_parquet_chunked(*tview, state->state); + state->writer->write(*tview); } CATCH_STD(env, ) } @@ -959,7 +960,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc std::unique_ptr make_sure_we_delete(state); try { cudf::jni::auto_set_device(env); - write_parquet_chunked_end(state->state); + state->writer->close(); } CATCH_STD(env, ) } @@ -1043,9 +1044,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( .compression(static_cast(j_compression)) .enable_statistics(true) .build(); - auto writer_ptr = std::make_shared(opts); + auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = - new cudf::jni::native_orc_writer_handle(writer_ptr, data_sink); + new cudf::jni::native_orc_writer_handle(std::move(writer_ptr), std::move(data_sink)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1084,8 +1085,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( .compression(static_cast(j_compression)) .enable_statistics(true) .build(); - auto writer_ptr = std::make_shared(opts); - cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(writer_ptr); + auto writer_ptr = std::make_unique(opts); + cudf::jni::native_orc_writer_handle *ret = + new cudf::jni::native_orc_writer_handle(std::move(writer_ptr)); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1107,7 +1109,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla } try { cudf::jni::auto_set_device(env); - state->state->write(*tview); + state->writer->write(*tview); } CATCH_STD(env, ) } @@ -1121,7 +1123,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass std::unique_ptr make_sure_we_delete(state); try { cudf::jni::auto_set_device(env); - state->state->close(); + state->writer->close(); } CATCH_STD(env, ) } @@ -1950,4 +1952,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_timeRangeRollingWindowAgg CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explode(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java new file mode 100644 index 00000000000..d8ba4548b6d --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java @@ -0,0 +1,330 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package ai.rapids.cudf;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+
+import ai.rapids.cudf.HostColumnVector.BasicType;
+import ai.rapids.cudf.HostColumnVector.ListType;
+import ai.rapids.cudf.HostColumnVector.StructType;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.DateDayVector;
+import org.apache.arrow.vector.DecimalVector;
+import org.apache.arrow.vector.Float4Vector;
+import org.apache.arrow.vector.Float8Vector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.util.Text;
+
+import org.junit.jupiter.api.Test;
+
+import static ai.rapids.cudf.TableTest.assertColumnsAreEqual;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class ArrowColumnVectorTest extends CudfTestBase {
+
+  @Test
+  void testArrowIntMultiBatches() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT32));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    int numVecs = 4;
+    IntVector[] vectors = new IntVector[numVecs];
+    try {
+      ArrayList<Integer> expectedArr = new ArrayList<Integer>();
+      for (int j = 0; j < numVecs; j++) {
+        int pos = 0;
+        int count = 10000;
+        IntVector vector = new IntVector("intVec", allocator);
+        int start = count * j;
+        int end = count * (j + 1);
+        for (int i = start; i < end; i++) {
+          expectedArr.add(i);
+          ((IntVector) vector).setSafe(pos, i);
+          pos++;
+        }
+        vector.setValueCount(count);
+        vectors[j] = vector;
+        ByteBuffer data = vector.getDataBuffer().nioBuffer();
+        ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+        builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      }
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      ColumnVector expected = ColumnVector.fromBoxedInts(expectedArr.toArray(new Integer[0]));
+      assertEquals(cv.getType(), DType.INT32);
+      assertColumnsAreEqual(expected, cv, "ints");
+    } finally {
+      for (int i = 0; i < numVecs; i++) {
+        vectors[i].close();
+      }
+    }
+  }
+
+  @Test
+  void testArrowLong() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT64));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (BigIntVector vector = new BigIntVector("vec", allocator)) {
+      ArrayList<Long> expectedArr = new ArrayList<Long>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Long(i));
+        ((BigIntVector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.INT64);
+      ColumnVector expected = ColumnVector.fromBoxedLongs(expectedArr.toArray(new Long[0]));
+      assertColumnsAreEqual(expected, cv, "Longs");
+    }
+  }
+
+  @Test
+  void testArrowLongOnHeap() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT64));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (BigIntVector vector = new BigIntVector("vec", allocator)) {
+      ArrayList<Long> expectedArr = new ArrayList<Long>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Long(i));
+        ((BigIntVector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      // test that we convert the buffers to direct byte buffers when they are on the heap
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer dataOnHeap = ByteBuffer.allocate(data.remaining());
+      dataOnHeap.put(data);
+      dataOnHeap.flip();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      ByteBuffer validOnHeap = ByteBuffer.allocate(valid.remaining());
+      validOnHeap.put(valid);
+      validOnHeap.flip();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), dataOnHeap, validOnHeap, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.INT64);
+      ColumnVector expected = ColumnVector.fromBoxedLongs(expectedArr.toArray(new Long[0]));
+      assertColumnsAreEqual(expected, cv, "Longs");
+    }
+  }
+
+  @Test
+  void testArrowDouble() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.FLOAT64));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (Float8Vector vector = new Float8Vector("vec", allocator)) {
+      ArrayList<Double> expectedArr = new ArrayList<Double>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Double(i));
+        ((Float8Vector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.FLOAT64);
+      double[] array = expectedArr.stream().mapToDouble(i->i).toArray();
+      ColumnVector expected = ColumnVector.fromDoubles(array);
+      assertColumnsAreEqual(expected, cv, "doubles");
+    }
+  }
+
+  @Test
+  void testArrowFloat() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.FLOAT32));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (Float4Vector vector = new Float4Vector("vec", allocator)) {
+      ArrayList<Float> expectedArr = new ArrayList<Float>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(new Float(i));
+        ((Float4Vector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.FLOAT32);
+      float[] floatArray = new float[expectedArr.size()];
+      int i = 0;
+      for (Float f : expectedArr) {
+        floatArray[i++] = (f != null ? f : Float.NaN); // no nulls are expected here; NaN is a placeholder
+      }
+      ColumnVector expected = ColumnVector.fromFloats(floatArray);
+      assertColumnsAreEqual(expected, cv, "floats");
+    }
+  }
+
+  @Test
+  void testArrowString() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.STRING));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (VarCharVector vector = new VarCharVector("vec", allocator)) {
+      ArrayList<String> expectedArr = new ArrayList<String>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        String toAdd = i + "testString";
+        expectedArr.add(toAdd);
+        ((VarCharVector) vector).setSafe(i, new Text(toAdd));
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      ByteBuffer offsets = vector.getOffsetBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, offsets);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.STRING);
+      ColumnVector expected = ColumnVector.fromStrings(expectedArr.toArray(new String[0]));
+      assertColumnsAreEqual(expected, cv, "Strings");
+    }
+  }
+
+  @Test
+  void testArrowStringOnHeap() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.STRING));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (VarCharVector vector = new VarCharVector("vec", allocator)) {
+      ArrayList<String> expectedArr = new ArrayList<String>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        String toAdd = i + "testString";
+        expectedArr.add(toAdd);
+        ((VarCharVector) vector).setSafe(i, new Text(toAdd));
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      ByteBuffer offsets = vector.getOffsetBuffer().nioBuffer();
+      ByteBuffer dataOnHeap = ByteBuffer.allocate(data.remaining());
+      dataOnHeap.put(data);
+      dataOnHeap.flip();
+      ByteBuffer validOnHeap = ByteBuffer.allocate(valid.remaining());
+      validOnHeap.put(valid);
+      validOnHeap.flip();
+      ByteBuffer offsetsOnHeap = ByteBuffer.allocate(offsets.remaining());
+      offsetsOnHeap.put(offsets);
+      offsetsOnHeap.flip();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), dataOnHeap, validOnHeap, offsetsOnHeap);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.STRING);
+      ColumnVector expected = ColumnVector.fromStrings(expectedArr.toArray(new String[0]));
+      assertColumnsAreEqual(expected, cv, "Strings");
+    }
+  }
+
+  @Test
+  void testArrowDays() {
+    ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.TIMESTAMP_DAYS));
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (DateDayVector vector = new DateDayVector("vec", allocator)) {
+      ArrayList<Integer> expectedArr = new ArrayList<Integer>();
+      int count = 10000;
+      for (int i = 0; i < count; i++) {
+        expectedArr.add(i);
+        ((DateDayVector) vector).setSafe(i, i);
+      }
+      vector.setValueCount(count);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      ColumnVector cv = builder.buildAndPutOnDevice();
+      assertEquals(cv.getType(), DType.TIMESTAMP_DAYS);
+      int[] array = expectedArr.stream().mapToInt(i->i).toArray();
+      ColumnVector expected = ColumnVector.daysFromInts(array);
+      assertColumnsAreEqual(expected, cv, "timestamp days");
+    }
+  }
+
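For reference, a minimal sketch of driving ArrowColumnBuilder end to end, distilled from the tests above; it is not part of the patch and assumes only the API those tests exercise (the typed-builder constructor, addBatch(), and buildAndPutOnDevice()); the vector/class names are illustrative:

// Build a cudf INT32 column from a host-side Arrow IntVector.
import java.nio.ByteBuffer;
import ai.rapids.cudf.ArrowColumnBuilder;
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.DType;
import ai.rapids.cudf.HostColumnVector;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;

public class ArrowToCudfSketch {
  public static void main(String[] args) {
    ArrowColumnBuilder builder =
        new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.INT32));
    try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         IntVector vector = new IntVector("ints", allocator)) {
      for (int i = 0; i < 4; i++) {
        vector.setSafe(i, i * 10); // 0, 10, 20, 30
      }
      vector.setValueCount(4);
      // Hand the raw Arrow buffers to the builder; an offsets buffer is only
      // needed for variable-width types such as strings.
      ByteBuffer data = vector.getDataBuffer().nioBuffer();
      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
      try (ColumnVector cv = builder.buildAndPutOnDevice()) {
        assert cv.getType().equals(DType.INT32);
        assert cv.getRowCount() == 4;
      }
    }
  }
}

As in testArrowIntMultiBatches, addBatch() may be called repeatedly; the batches are concatenated into a single device column.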
+  @Test
+  void testArrowDecimalThrows() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (DecimalVector vector = new DecimalVector("vec", allocator, 7, 3)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.create(DType.DTypeEnum.DECIMAL32, 3)));
+      ((DecimalVector) vector).setSafe(0, -3);
+      ((DecimalVector) vector).setSafe(1, 1);
+      ((DecimalVector) vector).setSafe(2, 2);
+      ((DecimalVector) vector).setSafe(3, 3);
+      ((DecimalVector) vector).setSafe(4, 4);
+      ((DecimalVector) vector).setSafe(5, 5);
+      vector.setValueCount(6);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+
+  @Test
+  void testArrowDecimal64Throws() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (DecimalVector vector = new DecimalVector("vec", allocator, 18, 0)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new HostColumnVector.BasicType(true, DType.create(DType.DTypeEnum.DECIMAL64, -11)));
+      ((DecimalVector) vector).setSafe(0, -3);
+      ((DecimalVector) vector).setSafe(1, 1);
+      ((DecimalVector) vector).setSafe(2, 2);
+      vector.setValueCount(3);
+      ByteBuffer data = vector.getDataBuffer().nioBuffer();
+      ByteBuffer valid = vector.getValidityBuffer().nioBuffer();
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), data, valid, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+
+  @Test
+  void testArrowListThrows() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (ListVector vector = ListVector.empty("list", allocator)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new ListType(true, new HostColumnVector.BasicType(true, DType.STRING)));
+      // buffers don't matter since we expect it to throw anyway
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), null, null, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+
+  @Test
+  void testArrowStructThrows() {
+    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
+    try (StructVector vector = StructVector.empty("struct", allocator)) {
+      ArrowColumnBuilder builder = new ArrowColumnBuilder(new StructType(true, new HostColumnVector.BasicType(true, DType.STRING)));
+      // buffers don't matter since we expect it to throw anyway
+      builder.addBatch(vector.getValueCount(), vector.getNullCount(), null, null, null);
+      assertThrows(IllegalArgumentException.class, () -> {
+        builder.buildAndPutOnDevice();
+      });
+    }
+  }
+}
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 88ff50959f7..582b67b8287 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -2899,6 +2899,67 @@ void testExtractListElements() {
     }
   }
 
+  @Test
+  void testListContainsString() {
+    List<String> list1 = Arrays.asList("Héllo there", "thésé");
+    List<String> list2 = Arrays.asList("", "ARé some", "test strings");
+    List<String> list3 = Arrays.asList(null, "", "ARé some", "test strings", "thésé");
+    List<String> list4 = Arrays.asList(null, "", "ARé some", "test strings");
+    List<String> list5 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.STRING)), list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, null, null);
+         ColumnVector result = v.listContains(Scalar.fromString("thésé"))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testListContainsInt() {
+    List<Integer> list1 = Arrays.asList(1, 2, 3);
+    List<Integer> list2 = Arrays.asList(4, 5, 6);
+    List<Integer> list3 = Arrays.asList(7, 8, 9);
+    List<Integer> list4 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.INT32)), list1, list2, list3, list4);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, null);
+         ColumnVector result = v.listContains(Scalar.fromInt(7))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testListContainsStringCol() {
+    List<String> list1 = Arrays.asList("Héllo there", "thésé");
+    List<String> list2 = Arrays.asList("", "ARé some", "test strings");
+    List<String> list3 = Arrays.asList("FOO", "", "ARé some", "test");
+    List<String> list4 = Arrays.asList(null, "FOO", "", "ARé some", "test");
+    List<String> list5 = Arrays.asList(null, "FOO", "", "ARé some", "test");
+    List<String> list6 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.STRING)), list1, list2, list3, list4, list5, list6);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(true, true, true, true, null, null);
+         ColumnVector result = v.listContainsColumn(
+             ColumnVector.fromStrings("thésé", "", "test", "test", "iotA", null))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testListContainsIntCol() {
+    List<Integer> list1 = Arrays.asList(1, 2, 3);
+    List<Integer> list2 = Arrays.asList(4, 5, 6);
+    List<Integer> list3 = Arrays.asList(null, 8, 9);
+    List<Integer> list4 = Arrays.asList(null, 8, 9);
+    List<Integer> list5 = null;
+    try (ColumnVector v = ColumnVector.fromLists(new HostColumnVector.ListType(true,
+        new HostColumnVector.BasicType(true, DType.INT32)), list1, list2, list3, list4, list5);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(true, false, true, null, null);
+         ColumnVector result = v.listContainsColumn(ColumnVector.fromBoxedInts(3, 3, 8, 3, null))) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
   @Test
   void testStringSplitRecord() {
     try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings");
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index ebd8dadc514..35be427d0c8 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -4426,4 +4426,55 @@ void testBuilderWithColumn() {
       }
     }
   }
+
+  @Test
+  void testExplode() {
+    // Child is primitive type
+    try (Table t1 = new Table.TestBuilder()
+        .column(new ListType(true, new BasicType(true, DType.INT32)),
+            Arrays.asList(1, 2, 3),
+            Arrays.asList(4, 5),
+            Arrays.asList(6),
+            null)
+        .column("s1", "s2", "s3", "s4")
+        .column(   1,    3,    5,    7)
+        .column(12.0, 14.0, 13.0, 11.0)
+        .build();
+         Table expected = new Table.TestBuilder()
+             .column(   1,    2,    3,    4,    5,    6)
+             .column("s1", "s1", "s1", "s2", "s2", "s3")
+             .column(   1,    1,    1,    3,    3,    5)
+             .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0)
+             .build()) {
+      try (Table exploded = t1.explode(0)) {
+        assertTablesAreEqual(expected, exploded);
+      }
+    }
+
+    // Child is nested type
+    StructType nestedType = new StructType(false,
+        new BasicType(false,
DType.STRING)); + try (Table t1 = new Table.TestBuilder() + .column(new ListType(false, nestedType), + Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")), + Arrays.asList(struct(4, "k4"), struct(5, "k5")), + Arrays.asList(struct(6, "k6"))) + .column("s1", "s2", "s3") + .column( 1, 3, 5) + .column(12.0, 14.0, 13.0) + .build(); + Table expected = new Table.TestBuilder() + .column(nestedType, + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), struct(5, "k5"), struct(6, "k6")) + .column("s1", "s1", "s1", "s2", "s2", "s3") + .column( 1, 1, 1, 3, 3, 5) + .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0) + .build()) { + try (Table exploded = t1.explode(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } + } diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 77d69ebc150..2d9438b515f 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. from cudf.utils.gpu_utils import validate_setup # isort:skip validate_setup() @@ -40,7 +40,7 @@ merge, ) from cudf.core.algorithms import factorize -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype from cudf.core.groupby import Grouper from cudf.core.ops import ( add, @@ -64,7 +64,7 @@ ) from cudf.core.reshape import concat, get_dummies, melt, merge_sorted from cudf.core.series import isclose -from cudf.core.tools.datetimes import to_datetime, DateOffset +from cudf.core.tools.datetimes import DateOffset, to_datetime from cudf.core.tools.numeric import to_numeric from cudf.io import ( from_dlpack, diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 975cfebcd59..b816f18b5aa 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import logging @@ -96,6 +96,10 @@ def set_rand_params(self, params): params_dict[param] = list( np.unique(np.random.choice(self._df.columns, col_size)) ) + elif param in ("skiprows", "num_rows"): + params_dict[param] = np.random.choice( + [None, self._rand(len(self._df))] + ) else: params_dict[param] = np.random.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index c392cefcabf..db2bcf74112 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
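To make the new Java surface concrete, here is a small usage sketch distilled from the explode and listContains tests above. It is an editor's sketch rather than part of the patch; it assumes Table's varargs ColumnVector constructor and the listContains/explode bindings added in this change, and the comments about null-list behavior follow what testExplode's expected table implies (null lists contribute no output rows):

import java.util.Arrays;
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.DType;
import ai.rapids.cudf.HostColumnVector;
import ai.rapids.cudf.Scalar;
import ai.rapids.cudf.Table;

public class ListApisSketch {
  public static void main(String[] args) {
    // A lists-of-int column: [[1, 2], [3], null]
    try (ColumnVector lists = ColumnVector.fromLists(
             new HostColumnVector.ListType(true,
                 new HostColumnVector.BasicType(true, DType.INT32)),
             Arrays.asList(1, 2), Arrays.asList(3), null);
         // listContains: one BOOL8 per row; a null list row yields a null result
         Scalar needle = Scalar.fromInt(3);
         ColumnVector contains = lists.listContains(needle);
         // explode: each list element becomes its own row; any other columns
         // in the table would be repeated alongside their elements
         Table t = new Table(lists);
         Table exploded = t.explode(0)) {
      assert contains.getRowCount() == 3; // false, true, null
      assert exploded.getRowCount() == 3; // 1 and 2 from row 0, 3 from row 1; the null list adds none
    }
  }
}

listContainsColumn works the same way but takes a per-row lookup column instead of a single scalar, as exercised by testListContainsStringCol and testListContainsIntCol above.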
import sys @@ -28,18 +28,29 @@ def parquet_reader_test(parquet_buffer): params={ "columns": ALL_POSSIBLE_VALUES, "use_pandas_metadata": [True, False], + "skiprows": ALL_POSSIBLE_VALUES, + "num_rows": ALL_POSSIBLE_VALUES, }, ) -def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata): +def parquet_reader_columns( + parquet_buffer, columns, use_pandas_metadata, skiprows, num_rows +): pdf = pd.read_parquet( parquet_buffer, columns=columns, use_pandas_metadata=use_pandas_metadata, ) + + pdf = pdf.iloc[skiprows:] + if num_rows is not None: + pdf = pdf.head(num_rows) + gdf = cudf.read_parquet( parquet_buffer, columns=columns, use_pandas_metadata=use_pandas_metadata, + skiprows=skiprows, + num_rows=num_rows, ) compare_dataframe(gdf, pdf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index be2d4ef5f51..0293518a5d9 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -10,13 +10,16 @@ datetime, filling, gpuarrow, + groupby, hash, interop, join, + json, merge, null_mask, nvtext, orc, + parquet, partitioning, quantiles, reduce, @@ -27,6 +30,7 @@ search, sort, stream_compaction, + string_casting, strings, table, transpose, diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi new file mode 100644 index 00000000000..0f8c044410d --- /dev/null +++ b/python/cudf/cudf/_lib/column.pyi @@ -0,0 +1,124 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from __future__ import annotations +from typing import Tuple, Union, TypeVar, Optional + +from cudf._typing import DtypeObj, Dtype, ScalarLike +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase + + +T = TypeVar("T") + +class Column: + _data: Optional[Buffer] + _mask: Optional[Buffer] + _base_data: Optional[Buffer] + _base_mask: Optional[Buffer] + _dtype: DtypeObj + _offset: int + _null_count: int + _children: Tuple[ColumnBase, ...] + _base_children: Tuple[ColumnBase, ...] + + def __init__( + self, + data: Optional[Buffer], + dtype: Dtype, + size: int = None, + mask: Optional[Buffer] = None, + offset: int = None, + null_count: int = None, + children: Tuple[ColumnBase, ...] = (), + ) -> None: + ... + + @property + def base_size(self) -> int: + ... + + @property + def dtype(self) -> DtypeObj: + ... + + @property + def size(self) -> int: + ... + + @property + def base_data(self) -> Optional[Buffer]: + ... + + @property + def base_data_ptr(self) -> int: + ... + + @property + def data(self) -> Optional[Buffer]: + ... + + @property + def data_ptr(self) -> int: + ... + + def set_base_data(self, value: Buffer) -> None: + ... + + @property + def nullable(self) -> bool: + ... + + @property + def has_nulls(self) -> bool: + ... + + @property + def base_mask(self) -> Optional[Buffer]: + ... + + @property + def base_mask_ptr(self) -> int: + ... + + @property + def mask(self) -> Optional[Buffer]: + ... + + @property + def mask_ptr(self) -> int: + ... + + def set_base_mask(self, value: Optional[Buffer]) -> None: + ... + + def set_mask(self: T, value: Optional[Buffer]) -> T: + ... + + @property + def null_count(self) -> int: + ... + + @property + def offset(self) -> int: + ... + + @property + def base_children(self) -> Tuple[ColumnBase, ...]: + ... + + @property + def children(self) -> Tuple[ColumnBase, ...]: + ... + + def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: + ... + + def _mimic_inplace(self, other_col: ColumnBase, inplace=False) -> Optional[ColumnBase]: + ... 
+ + @staticmethod + def from_scalar( + val: ScalarLike, + size: int + ) -> ColumnBase: # TODO: This should be Scalar, not ScalarLike + ... diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index c2f047fd0d5..28dacb5e944 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,6 +10,7 @@ import cudf from cudf.core.buffer import Buffer from cudf.utils.dtypes import ( is_categorical_dtype, + is_decimal_dtype, is_list_dtype, is_struct_dtype ) @@ -59,14 +60,14 @@ cdef class Column: The *dtype* indicates the Column's element type. """ def __init__( - self, - object data, - int size, - object dtype, - object mask=None, - int offset=0, - object null_count=None, - object children=() + self, + object data, + int size, + object dtype, + object mask=None, + int offset=0, + object null_count=None, + object children=() ): self._size = size @@ -246,10 +247,10 @@ cdef class Column: ) return cudf.core.column.build_column( - self.data, - self.dtype, - mask, - self.size, + data=self.data, + dtype=self.dtype, + mask=mask, + size=self.size, offset=0, children=self.children ) @@ -386,14 +387,19 @@ cdef class Column: tid = libcudf_types.type_id.LIST elif is_struct_dtype(self.dtype): tid = libcudf_types.type_id.STRUCT + elif is_decimal_dtype(self.dtype): + tid = libcudf_types.type_id.DECIMAL64 else: tid = ( ( np_to_cudf_types[np.dtype(data_dtype)] ) ) - - cdef libcudf_types.data_type dtype = libcudf_types.data_type(tid) + cdef libcudf_types.data_type dtype = ( + libcudf_types.data_type(tid, -self.dtype.scale) + if tid == libcudf_types.type_id.DECIMAL64 + else libcudf_types.data_type(tid) + ) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data @@ -555,25 +561,22 @@ cdef class Column: children = tuple(children) result = cudf.core.column.build_column( - data, - dtype, - mask, - size, - offset, - null_count, - tuple(children) + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + children=tuple(children) ) return result - -def make_column_from_scalar(object py_val, size_type size): - - cdef DeviceScalar val = py_val.device_value - - cdef const scalar* c_val = val.get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move(cpp_make_column_from_scalar(c_val[0], size)) - - return Column.from_unique_ptr(move(c_result)) + @staticmethod + def from_scalar(py_val, size_type size): + cdef DeviceScalar val = py_val.device_value + cdef const scalar* c_val = val.get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_make_column_from_scalar(c_val[0], size)) + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 412f8c25658..f7f094834e6 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.string cimport string @@ -71,7 +71,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view get_table() except + const cudf_io_types.table_metadata get_metadata() except + - bool is_enabled_return_filemetadata() except + string get_column_chunks_file_path() except+ void set_metadata( @@ -83,9 +82,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + - void enable_return_filemetadata( - bool req - ) except + void set_column_chunks_file_path( string column_chunks_file_path ) except + @@ -112,9 +108,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + - parquet_writer_options_builder& return_filemetadata( - bool req - ) except + parquet_writer_options_builder& column_chunks_file_path( string column_chunks_file_path ) except + @@ -168,21 +161,15 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options build() except + - cdef shared_ptr[pq_chunked_state] write_parquet_chunked_begin( - chunked_parquet_writer_options args - ) except + - - cdef void write_parquet_chunked(cudf_table_view.table_view table_, - shared_ptr[pq_chunked_state]) except + - - cdef unique_ptr[vector[uint8_t]] write_parquet_chunked_end( - shared_ptr[pq_chunked_state], - bool return_meta, - string column_chunks_file_path, - ) except + - - cdef cppclass pq_chunked_state: - pass + cdef cppclass parquet_chunked_writer: + parquet_chunked_writer() except+ + parquet_chunked_writer(chunked_parquet_writer_options args) except+ + parquet_chunked_writer& write( + cudf_table_view.table_view table_, + ) except+ + unique_ptr[vector[uint8_t]] close( + string column_chunks_file_path, + ) except+ cdef unique_ptr[vector[uint8_t]] merge_rowgroup_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index cf86076f8d6..bd1108b2cdf 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -47,32 +47,34 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: UNEQUAL "cudf::null_equality::UNEQUAL" ctypedef enum type_id "cudf::type_id": - EMPTY "cudf::type_id::EMPTY" - INT8 "cudf::type_id::INT8" - INT16 "cudf::type_id::INT16" - INT32 "cudf::type_id::INT32" - INT64 "cudf::type_id::INT64" - UINT8 "cudf::type_id::UINT8" - UINT16 "cudf::type_id::UINT16" - UINT32 "cudf::type_id::UINT32" - UINT64 "cudf::type_id::UINT64" - FLOAT32 "cudf::type_id::FLOAT32" - FLOAT64 "cudf::type_id::FLOAT64" - BOOL8 "cudf::type_id::BOOL8" - TIMESTAMP_DAYS "cudf::type_id::TIMESTAMP_DAYS" - TIMESTAMP_SECONDS "cudf::type_id::TIMESTAMP_SECONDS" + EMPTY "cudf::type_id::EMPTY" + INT8 "cudf::type_id::INT8" + INT16 "cudf::type_id::INT16" + INT32 "cudf::type_id::INT32" + INT64 "cudf::type_id::INT64" + UINT8 "cudf::type_id::UINT8" + UINT16 "cudf::type_id::UINT16" + UINT32 "cudf::type_id::UINT32" + UINT64 "cudf::type_id::UINT64" + FLOAT32 "cudf::type_id::FLOAT32" + FLOAT64 "cudf::type_id::FLOAT64" + BOOL8 "cudf::type_id::BOOL8" + TIMESTAMP_DAYS "cudf::type_id::TIMESTAMP_DAYS" + TIMESTAMP_SECONDS "cudf::type_id::TIMESTAMP_SECONDS" TIMESTAMP_MILLISECONDS "cudf::type_id::TIMESTAMP_MILLISECONDS" TIMESTAMP_MICROSECONDS "cudf::type_id::TIMESTAMP_MICROSECONDS" - 
TIMESTAMP_NANOSECONDS "cudf::type_id::TIMESTAMP_NANOSECONDS" - DICTIONARY32 "cudf::type_id::DICTIONARY32" - STRING "cudf::type_id::STRING" - LIST "cudf::type_id::LIST" - STRUCT "cudf::type_id::STRUCT" - NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" - DURATION_SECONDS "cudf::type_id::DURATION_SECONDS" - DURATION_MILLISECONDS "cudf::type_id::DURATION_MILLISECONDS" - DURATION_MICROSECONDS "cudf::type_id::DURATION_MICROSECONDS" - DURATION_NANOSECONDS "cudf::type_id::DURATION_NANOSECONDS" + TIMESTAMP_NANOSECONDS "cudf::type_id::TIMESTAMP_NANOSECONDS" + DICTIONARY32 "cudf::type_id::DICTIONARY32" + STRING "cudf::type_id::STRING" + LIST "cudf::type_id::LIST" + STRUCT "cudf::type_id::STRUCT" + NUM_TYPE_IDS "cudf::type_id::NUM_TYPE_IDS" + DURATION_SECONDS "cudf::type_id::DURATION_SECONDS" + DURATION_MILLISECONDS "cudf::type_id::DURATION_MILLISECONDS" + DURATION_MICROSECONDS "cudf::type_id::DURATION_MICROSECONDS" + DURATION_NANOSECONDS "cudf::type_id::DURATION_NANOSECONDS" + DECIMAL32 "cudf::type_id::DECIMAL32" + DECIMAL64 "cudf::type_id::DECIMAL64" ctypedef enum hash_id "cudf::hash_id": HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" @@ -85,7 +87,9 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: data_type() except + data_type(const data_type&) except + data_type(type_id id) except + + data_type(type_id id, int32_t scale) except + type_id id() except + + int32_t scale() except + cdef extern from "cudf/types.hpp" namespace "cudf" nogil: ctypedef enum interpolation: diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c7780d17b27..a9739a02283 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -31,7 +31,7 @@ from cudf._lib.utils import ( from libc.stdlib cimport free from libc.stdint cimport uint8_t -from libcpp.memory cimport shared_ptr, unique_ptr, make_unique +from libcpp.memory cimport unique_ptr, make_unique from libcpp.string cimport string from libcpp.map cimport map from libcpp.vector cimport vector @@ -50,13 +50,10 @@ from cudf._lib.cpp.io.parquet cimport ( parquet_reader_options, parquet_writer_options, write_parquet as parquet_writer, + parquet_chunked_writer as cpp_parquet_chunked_writer, chunked_parquet_writer_options, chunked_parquet_writer_options_builder, - write_parquet_chunked_begin, - write_parquet_chunked, - write_parquet_chunked_end, merge_rowgroup_metadata as parquet_merge_metadata, - pq_chunked_state ) from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( @@ -323,11 +320,9 @@ cpdef write_parquet( cdef parquet_writer_options args cdef unique_ptr[vector[uint8_t]] out_metadata_c cdef string c_column_chunks_file_path - cdef bool return_filemetadata = False cdef bool _int96_timestamps = int96_timestamps if metadata_file_path is not None: c_column_chunks_file_path = str.encode(metadata_file_path) - return_filemetadata = True # Perform write with nogil: @@ -337,7 +332,6 @@ cpdef write_parquet( .compression(comp_type) .stats_level(stat_freq) .column_chunks_file_path(c_column_chunks_file_path) - .return_filemetadata(return_filemetadata) .int96_timestamps(_int96_timestamps) .build() ) @@ -361,7 +355,8 @@ cdef class ParquetWriter: -------- cudf.io.parquet.write_parquet """ - cdef shared_ptr[pq_chunked_state] state + cdef bool initialized + cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef cudf_io_types.sink_info sink cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.statistics_freq stat_freq @@ -374,43 +369,39 @@ cdef class ParquetWriter: self.stat_freq = 
_get_stat_freq(statistics) self.comp_type = _get_comp_type(compression) self.index = index + self.initialized = False def write_table(self, Table table): """ Writes a single table to the file """ - if not self.state: + if not self.initialized: self._initialize_chunked_state(table) - cdef table_view tv = table.data_view() - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex) \ - or table._index.name is not None: - tv = table.view() + cdef table_view tv + if self.index is not False and ( + table._index.name is not None or + isinstance(table._index, cudf.core.multiindex.MultiIndex)): + tv = table.view() + else: + tv = table.data_view() with nogil: - write_parquet_chunked(tv, self.state) + self.writer.get()[0].write(tv) def close(self, object metadata_file_path=None): cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef bool return_meta cdef string column_chunks_file_path - if not self.state: + if not self.initialized: return None # Update metadata-collection options if metadata_file_path is not None: column_chunks_file_path = str.encode(metadata_file_path) - return_meta = True - else: - return_meta = False with nogil: out_metadata_c = move( - write_parquet_chunked_end( - self.state, return_meta, column_chunks_file_path - ) + self.writer.get()[0].close(column_chunks_file_path) ) - self.state.reset() if metadata_file_path is not None: out_metadata_py = BufferArrayFromVector.from_unique_ptr( @@ -423,8 +414,8 @@ cdef class ParquetWriter: self.close() def _initialize_chunked_state(self, Table table): - """ Wraps write_parquet_chunked_begin. This is called lazily on the first - call to write, so that we can get metadata from the first table """ + """ Prepares all the values required to build the + chunked_parquet_writer_options and creates a writer""" cdef unique_ptr[cudf_io_types.table_metadata_with_nullability] tbl_meta tbl_meta = make_unique[cudf_io_types.table_metadata_with_nullability]() @@ -434,7 +425,6 @@ cdef class ParquetWriter: tbl_meta.get().user_data[str.encode("pandas")] = \ str.encode(pandas_metadata) - # call write_parquet_chunked_begin cdef chunked_parquet_writer_options args with nogil: args = move( @@ -444,7 +434,8 @@ cdef class ParquetWriter: .stats_level(self.stat_freq) .build() ) - self.state = write_parquet_chunked_begin(args) + self.writer.reset(new cpp_parquet_chunked_writer(args)) + self.initialized = True cpdef merge_filemetadata(object filemetadata_list): diff --git a/python/cudf/cudf/_lib/table.pyi b/python/cudf/cudf/_lib/table.pyi new file mode 100644 index 00000000000..772e940f812 --- /dev/null +++ b/python/cudf/cudf/_lib/table.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from typing import List, Any, Optional, TYPE_CHECKING + +import cudf + +class Table(object): + _data: cudf.core.column_accessor.ColumnAccessor + _index: Optional[cudf.core.index.Index] + + def __init__(self, data: object = None, index: object = None) -> None: ... + + @property + def _num_columns(self) -> int: ... + + @property + def _num_indices(self) -> int: ... + + @property + def _num_rows(self) -> int: ... + + @property + def _column_names(self) -> List[Any]: ... + + @property + def _index_names(self) -> List[Any]: ... + + @property + def _columns(self) -> List[Any]: ... 
# TODO: actually, a list of columns diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index c6e19840d6a..9b35ca2e80c 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp cimport bool diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 5998f9ec2f9..370d083d7ac 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from enum import IntEnum @@ -14,7 +14,7 @@ from cudf._lib.types cimport ( ) from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype cimport cudf._lib.cpp.types as libcudf_types @@ -64,6 +64,8 @@ class TypeId(IntEnum): DURATION_NANOSECONDS = ( libcudf_types.type_id.DURATION_NANOSECONDS ) + DECIMAL32 = libcudf_types.type_id.DECIMAL32 + DECIMAL64 = libcudf_types.type_id.DECIMAL64 np_to_cudf_types = { @@ -188,12 +190,21 @@ cdef dtype_from_structs_column_view(column_view cv): } return StructDtype(fields) +cdef dtype_from_decimal_column_view(column_view cv): + scale = -cv.type().scale() + precision = 18 # max of 64 bit integer + return Decimal64Dtype(precision=precision, scale=scale) + cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() if tid == libcudf_types.type_id.LIST: - dtype = dtype_from_lists_column_view(cv) + return dtype_from_lists_column_view(cv) elif tid == libcudf_types.type_id.STRUCT: - dtype = dtype_from_structs_column_view(cv) + return dtype_from_structs_column_view(cv) + elif tid == libcudf_types.type_id.DECIMAL64: + return dtype_from_decimal_column_view(cv) + elif tid == libcudf_types.type_id.DECIMAL32: + raise NotImplementedError("decimal32 types are not supported yet. " + "Use decimal64 instead") else: - dtype = cudf_to_np_types[(tid)] - return dtype + return cudf_to_np_types[(tid)] diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py new file mode 100644 index 00000000000..0087daa1676 --- /dev/null +++ b/python/cudf/cudf/_typing.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from typing import TYPE_CHECKING, Any, TypeVar, Union + +import numpy as np +from pandas import Period, Timedelta, Timestamp +from pandas.api.extensions import ExtensionDtype + +if TYPE_CHECKING: + import cudf + +# Many of these are from +# https://github.com/pandas-dev/pandas/blob/master/pandas/_typing.py + +Dtype = Union["ExtensionDtype", str, np.dtype] +DtypeObj = Union["ExtensionDtype", np.dtype] + +# scalars +DatetimeLikeScalar = TypeVar( + "DatetimeLikeScalar", Period, Timestamp, Timedelta +) +ScalarLike = Any + +# columns +ColumnLike = Any + +# binary operation +BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"] diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index d6c232373c7..91a369c31f8 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. 
-from cudf.core import buffer, column, common +from cudf.core import buffer, column, column_accessor, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index 02150a79d57..0550b1d4de0 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -12,9 +12,9 @@ try: import pickle5 as pickle except ImportError: - import pickle + import pickle # type: ignore else: - import pickle + import pickle # type: ignore class Serializable(abc.ABC): diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 08bc068c28c..350346a87f9 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -1,7 +1,10 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from __future__ import annotations + import functools import operator import pickle +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -12,7 +15,13 @@ class Buffer(Serializable): - def __init__(self, data=None, size=None, owner=None): + ptr: int + size: int + _owner: Any + + def __init__( + self, data: Any = None, size: Optional[int] = None, owner: Any = None + ): """ A Buffer represents a device memory allocation. @@ -36,7 +45,6 @@ def __init__(self, data=None, size=None, owner=None): elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): - self._init_from_array_like(data, owner) elif isinstance(data, memoryview): self._init_from_array_like(np.asarray(data), owner) @@ -57,15 +65,15 @@ def __init__(self, data=None, size=None, owner=None): raise TypeError("data must be Buffer, array-like or integer") self._init_from_array_like(np.asarray(data), owner) - def __len__(self): + def __len__(self) -> int: return self.size @property - def nbytes(self): + def nbytes(self) -> int: return self.size @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> dict: intf = { "data": (self.ptr, False), "shape": (self.size,), @@ -102,8 +110,8 @@ def _init_from_array_like(self, data, owner): f"Cannot construct Buffer from {data.__class__.__name__}" ) - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) header["constructor-kwargs"] = {} header["desc"] = self.__cuda_array_interface__.copy() @@ -112,7 +120,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> Buffer: buf = cls(frames[0], **header["constructor-kwargs"]) if header["desc"]["shape"] != buf.__cuda_array_interface__["shape"]: @@ -125,7 +133,7 @@ def deserialize(cls, header, frames): return buf @classmethod - def empty(cls, size): + def empty(cls, size: int) -> Buffer: dbuf = DeviceBuffer(size=size) return Buffer(dbuf) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 7e583ea4b2b..81dab52d353 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( @@ -21,3 +21,4 @@ from cudf.core.column.string import StringColumn # noqa: F401 from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 +from cudf.core.column.decimal import DecimalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index ff514e6c6f0..498851c47ee 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,12 +1,27 @@ # Copyright (c) 2018-2020, NVIDIA CORPORATION. +from __future__ import annotations + import pickle +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Mapping, + Optional, + Tuple, + Union, + cast, +) import numpy as np import pandas as pd +from numba import cuda import cudf from cudf import _lib as libcudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.transform import bools_to_mask +from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethodsMixin @@ -18,9 +33,23 @@ min_unsigned_type, ) +if TYPE_CHECKING: + from cudf.core.column import ( + ColumnBase, + DatetimeColumn, + NumericalColumn, + StringColumn, + TimeDeltaColumn, + ) + + +ParentType = Union["cudf.Series", "cudf.Index"] + class CategoricalAccessor(ColumnMethodsMixin): - def __init__(self, column, parent=None): + _column: CategoricalColumn + + def __init__(self, column: Any, parent: ParentType = None): """ Accessor object for categorical properties of the Series values. Be aware that assigning to `categories` is a inplace operation, @@ -28,7 +57,8 @@ def __init__(self, column, parent=None): Parameters ---------- - data : Series or CategoricalIndex + column : Column + parent : Series or CategoricalIndex Examples -------- @@ -77,34 +107,35 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) @property - def categories(self): + def categories(self) -> "cudf.Index": """ The categories of this categorical. """ return cudf.core.index.as_index(self._column.categories) @property - def codes(self): + def codes(self) -> "cudf.Series": """ Return Series of codes as well as the index. """ - return cudf.Series( - self._column.codes, - index=self._parent.index if self._parent is not None else None, + index = ( + self._parent.index + if isinstance(self._parent, cudf.Series) + else None ) + return cudf.Series(self._column.codes, index=index) @property - def ordered(self): + def ordered(self) -> bool: """ Whether the categories have an ordered relationship. """ return self._column.ordered - def as_ordered(self, inplace=False): + def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: """ Set the Categorical to be ordered. @@ -165,7 +196,7 @@ def as_ordered(self, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def as_unordered(self, inplace=False): + def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: """ Set the Categorical to be unordered. 
@@ -237,7 +268,9 @@ def as_unordered(self, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def add_categories(self, new_categories, inplace=False): + def add_categories( + self, new_categories: Any, inplace: bool = False + ) -> Optional[ParentType]: """ Add new categories. @@ -320,7 +353,9 @@ def add_categories(self, new_categories, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def remove_categories(self, removals, inplace=False): + def remove_categories( + self, removals: Any, inplace: bool = False, + ) -> Optional[ParentType]: """ Remove the specified categories. @@ -411,8 +446,12 @@ def remove_categories(self, removals, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) def set_categories( - self, new_categories, ordered=None, rename=False, inplace=False, - ): + self, + new_categories: Any, + ordered: bool = False, + rename: bool = False, + inplace: bool = False, + ) -> Optional[ParentType]: """ Set the categories to the specified new_categories. @@ -539,7 +578,12 @@ def set_categories( ) return self._return_or_inplace(out_col, inplace=inplace) - def reorder_categories(self, new_categories, ordered=False, inplace=False): + def reorder_categories( + self, + new_categories: Any, + ordered: bool = False, + inplace: bool = False, + ) -> Optional[ParentType]: """ Reorder categories as specified in new_categories. @@ -621,9 +665,9 @@ def reorder_categories(self, new_categories, ordered=False, inplace=False): return self._return_or_inplace(out_col, inplace=inplace) - def _categories_equal(self, new_categories, ordered=None): - ordered = ordered if ordered is not None else self.ordered - + def _categories_equal( + self, new_categories: ColumnBase, ordered=False + ) -> bool: cur_categories = self._column.categories if len(new_categories) != len(cur_categories): return False @@ -640,8 +684,12 @@ def _categories_equal(self, new_categories, ordered=None): return cur_categories.equals(new_categories) def _set_categories( - self, current_categories, new_categories, is_unique=False, ordered=None - ): + self, + current_categories: Any, + new_categories: Any, + is_unique: bool = False, + ordered: bool = False, + ) -> CategoricalColumn: """Returns a new CategoricalColumn with the categories set to the specified *new_categories*. @@ -705,14 +753,17 @@ class CategoricalColumn(column.ColumnBase): """Implements operations for Columns of Categorical type """ + _codes: Optional[NumericalColumn] + _children: Tuple[NumericalColumn] + def __init__( self, - dtype, - mask=None, - size=None, - offset=0, - null_count=None, - children=(), + dtype: CategoricalDtype, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + children: Tuple["column.ColumnBase", ...] 
= (), ): """ Parameters @@ -722,7 +773,7 @@ def __init__( The validity mask offset : int Data offset - children : Tuple[Column] + children : Tuple[ColumnBase] Two non-null columns containing the categories and codes respectively """ @@ -745,24 +796,23 @@ def __init__( null_count=null_count, children=children, ) - self._codes = None @property - def base_size(self): + def base_size(self) -> int: return int( (self.base_children[0].size) / self.base_children[0].dtype.itemsize ) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: try: self._encode(item) except ValueError: return False return self._encode(item) in self.as_numerical - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] frames = [] header["type-serialized"] = pickle.dumps(type(self)) header["dtype"], dtype_frames = self.dtype.serialize() @@ -771,7 +821,7 @@ def serialize(self): header["data"], data_frames = self.codes.serialize() header["data_frames_count"] = len(data_frames) frames.extend(data_frames) - if self.nullable: + if self.mask is not None: mask_header, mask_frames = self.mask.serialize() header["mask"] = mask_header frames.extend(mask_frames) @@ -779,7 +829,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> CategoricalColumn: n_dtype_frames = header["dtype_frames_count"] dtype = CategoricalDtype.deserialize( header["dtype"], frames[:n_dtype_frames] @@ -796,11 +846,14 @@ def deserialize(cls, header, frames): mask = Buffer.deserialize( header["mask"], [frames[n_dtype_frames + n_data_frames]] ) - return column.build_column( - data=None, - dtype=dtype, - mask=mask, - children=(column.as_column(data.base_data, dtype=data.dtype),), + return cast( + CategoricalColumn, + column.build_column( + data=None, + dtype=dtype, + mask=mask, + children=(column.as_column(data.base_data, dtype=data.dtype),), + ), ) def set_base_data(self, value): @@ -812,16 +865,16 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_mask(self, value): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) self._codes = None - def set_base_children(self, value): + def set_base_children(self, value: Tuple[ColumnBase, ...]): super().set_base_children(value) self._codes = None @property - def children(self): + def children(self) -> Tuple[NumericalColumn]: if self._children is None: codes_column = self.base_children[0] @@ -829,20 +882,26 @@ def children(self): buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize) buf.size = self.size * codes_column.dtype.itemsize - codes_column = column.build_column( - data=buf, dtype=codes_column.dtype, size=self.size, + codes_column = cast( + cudf.core.column.NumericalColumn, + column.build_column( + data=buf, dtype=codes_column.dtype, size=self.size, + ), ) self._children = (codes_column,) return self._children @property - def as_numerical(self): - return column.build_column( - data=self.codes.data, dtype=self.codes.dtype, mask=self.mask + def as_numerical(self) -> NumericalColumn: + return cast( + cudf.core.column.NumericalColumn, + column.build_column( + data=self.codes.data, dtype=self.codes.dtype, mask=self.mask + ), ) @property - def categories(self): + def categories(self) -> ColumnBase: return self.dtype.categories._values @categories.setter @@ -852,30 +911,82 @@ def categories(self, value): ) @property - def codes(self): + def codes(self) -> 
NumericalColumn: if self._codes is None: self._codes = self.children[0].set_mask(self.mask) - return self._codes + return cast(cudf.core.column.NumericalColumn, self._codes) @property - def ordered(self): + def ordered(self) -> bool: return self.dtype.ordered @ordered.setter - def ordered(self, value): + def ordered(self, value: bool): self.dtype.ordered = value - def cat(self, parent=None): + def cat(self, parent: ParentType = None): return CategoricalAccessor(self, parent=parent) - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: str): raise TypeError( f"Series of dtype `category` cannot perform the operation: " f"{unaryop}" ) - def binary_operator(self, op, rhs, reflect=False): + def __setitem__(self, key, value): + if cudf.utils.dtypes.is_scalar(value): + value = self._encode(value) if value is not None else value + else: + value = cudf.core.column.as_column(value).astype(self.dtype) + value = value.codes + codes = self.codes + codes[key] = value + out = cudf.core.column.build_categorical_column( + categories=self.categories, + codes=codes, + mask=codes.base_mask, + size=codes.size, + offset=self.offset, + ordered=self.ordered, + ) + self._mimic_inplace(out, inplace=True) + + def _fill( + self, + fill_value: ScalarLike, + begin: int, + end: int, + inplace: bool = False, + ) -> "column.ColumnBase": + if end <= begin or begin >= self.size: + return self if inplace else self.copy() + + fill_code = self._encode(fill_value) + fill_scalar = as_device_scalar(fill_code, self.codes.dtype) + + result = self if inplace else self.copy() + + libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) + return result + + def slice( + self, start: int, stop: int, stride: int = None + ) -> "column.ColumnBase": + codes = self.codes.slice(start, stop, stride) + return cudf.core.column.build_categorical_column( + categories=self.categories, + codes=cudf.core.column.as_column( + codes.base_data, dtype=codes.dtype + ), + mask=codes.base_mask, + ordered=self.ordered, + size=codes.size, + offset=codes.offset, + ) + def binary_operator( + self, op: str, rhs, reflect: bool = False + ) -> ColumnBase: if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"): if op in ("lt", "gt", "le", "ge"): raise TypeError( @@ -889,7 +1000,7 @@ def binary_operator(self, op, rhs, reflect=False): raise TypeError("Categoricals can only compare with the same type") return self.as_numerical.binary_operator(op, rhs.as_numerical) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: if isinstance(other, np.ndarray) and other.ndim == 0: other = other.item() @@ -905,7 +1016,9 @@ def normalize_binop_value(self, other): ) return col - def sort_by_values(self, ascending=True, na_position="last"): + def sort_by_values( + self, ascending: bool = True, na_position="last" + ) -> Tuple[CategoricalColumn, NumericalColumn]: codes, inds = self.as_numerical.sort_by_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories, @@ -916,19 +1029,21 @@ def sort_by_values(self, ascending=True, na_position="last"): ) return col, inds - def element_indexing(self, index): + def element_indexing(self, index: int) -> ScalarLike: val = self.as_numerical.element_indexing(index) - return self._decode(val) if val is not None else val + return self._decode(int(val)) if val is not None else val @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> Mapping[str, Any]: raise TypeError( 
"Categorical does not support `__cuda_array_interface__`." " Please consider using `.codes` or `.categories`" " if you need this functionality." ) - def to_pandas(self, index=None, nullable=False): + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> pd.Series: signed_dtype = min_signed_type(len(self.categories)) codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array() categories = self.categories.to_pandas() @@ -938,7 +1053,7 @@ def to_pandas(self, index=None, nullable=False): return pd.Series(data, index=index) @property - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the CategoricalColumn. """ @@ -951,7 +1066,16 @@ def values(self): """ raise NotImplementedError("cudf.Categorical is not yet implemented") - def unique(self): + def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": + return ( + self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + ) + + @property + def data_array_view(self) -> cuda.devicearray.DeviceNDArray: + return self.codes.data_array_view + + def unique(self) -> CategoricalColumn: codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, @@ -962,18 +1086,23 @@ def unique(self): ordered=self.ordered, ) - def _encode(self, value): + def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) - def _decode(self, value): + def _decode(self, value: int) -> ScalarLike: if value == self.default_na_value(): return None return self.categories.element_indexing(value) - def default_na_value(self): + def default_na_value(self) -> ScalarLike: return -1 - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> CategoricalColumn: """ Return col with *to_replace* replaced with *replacement*. 
""" @@ -1038,7 +1167,9 @@ def find_and_replace(self, to_replace, replacement, all_nan): ordered=self.dtype.ordered, ) - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: Any = None, dtype: Dtype = None + ) -> CategoricalColumn: """ Fill null values with *fill_value* """ @@ -1084,20 +1215,22 @@ def fillna(self, fill_value=None, method=None): return result - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ return self.as_numerical.find_first_value(self._encode(value)) - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ return self.as_numerical.find_last_value(self._encode(value)) @property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: if not hasattr(self, "_is_monotonic_increasing"): self._is_monotonic_increasing = ( self.ordered and self.as_numerical.is_monotonic_increasing @@ -1105,14 +1238,16 @@ def is_monotonic_increasing(self): return self._is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: if not hasattr(self, "_is_monotonic_decreasing"): self._is_monotonic_decreasing = ( self.ordered and self.as_numerical.is_monotonic_decreasing ) return self._is_monotonic_decreasing - def as_categorical_column(self, dtype, **kwargs): + def as_categorical_column( + self, dtype: Dtype, **kwargs + ) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self if ( @@ -1129,6 +1264,9 @@ def as_categorical_column(self, dtype, **kwargs): categories=dtype.categories, ordered=dtype.ordered ) + if not isinstance(dtype, CategoricalDtype): + raise ValueError("dtype must be CategoricalDtype") + if not isinstance(self.categories, type(dtype.categories._values)): # If both categories are of different Column types, # return a column full of Nulls. 
@@ -1138,25 +1276,25 @@ def as_categorical_column(self, dtype, **kwargs): new_categories=dtype.categories, ordered=dtype.ordered ) - def as_numerical_column(self, dtype): + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column(self, dtype, **kwargs): + def as_string_column(self, dtype, format=None) -> StringColumn: return self._get_decategorized_column().as_string_column( - dtype, **kwargs + dtype, format=format ) - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column(self, dtype, **kwargs) -> DatetimeColumn: return self._get_decategorized_column().as_datetime_column( dtype, **kwargs ) - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: return self._get_decategorized_column().as_timedelta_column( dtype, **kwargs ) - def _get_decategorized_column(self): + def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.cat().codes._column @@ -1165,7 +1303,7 @@ def _get_decategorized_column(self): out = out.set_mask(self.mask) return out - def copy(self, deep=True): + def copy(self, deep: bool = True) -> CategoricalColumn: if deep: copied_col = libcudf.copying.copy_column(self) copied_cat = libcudf.copying.copy_column(self.dtype._categories) @@ -1192,12 +1330,13 @@ def copy(self, deep=True): size=self.size, ) - def __sizeof__(self): + def __sizeof__(self) -> int: return ( self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__() ) - def _memory_usage(self, deep=False): + def _memory_usage(self, **kwargs) -> int: + deep = kwargs.get("deep", False) if deep: return self.__sizeof__() else: @@ -1206,22 +1345,25 @@ def _memory_usage(self, deep=False): + self.cat().codes.memory_usage() ) - def _mimic_inplace(self, other_col, inplace=False): + def _mimic_inplace( + self, other_col: ColumnBase, inplace: bool = False + ) -> Optional[ColumnBase]: out = super()._mimic_inplace(other_col, inplace=inplace) - if inplace: + if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes return out - def view(self, dtype): + def view(self, dtype: Dtype) -> ColumnBase: raise NotImplementedError( "Categorical column views are not currently supported" ) -def _create_empty_categorical_column(categorical_column, dtype): - +def _create_empty_categorical_column( + categorical_column: CategoricalColumn, dtype: "CategoricalDtype" +) -> CategoricalColumn: return column.build_categorical_column( - categories=dtype.categories, + categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), @@ -1236,7 +1378,9 @@ def _create_empty_categorical_column(categorical_column, dtype): ) -def pandas_categorical_as_column(categorical, codes=None): +def pandas_categorical_as_column( + categorical: ColumnLike, codes: ColumnLike = None +) -> CategoricalColumn: """Creates a CategoricalColumn from a pandas.Categorical diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1a32842b027..670dd456de9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,9 +1,24 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+from __future__ import annotations +import builtins import pickle import warnings -from numbers import Number +from collections.abc import MutableSequence from types import SimpleNamespace +from typing import ( + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) import cupy import numpy as np @@ -22,6 +37,7 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count from cudf._lib.transform import bools_to_mask +from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.core.dtypes import CategoricalDtype @@ -32,6 +48,7 @@ cudf_dtypes_to_pandas_dtypes, get_time_unit, is_categorical_dtype, + is_decimal_dtype, is_list_dtype, is_numerical_dtype, is_scalar, @@ -43,68 +60,34 @@ ) from cudf.utils.utils import mask_dtype +T = TypeVar("T", bound="ColumnBase") -class ColumnBase(Column, Serializable): - def __init__( - self, - data, - size, - dtype, - mask=None, - offset=0, - null_count=None, - children=(), - ): - """ - Parameters - ---------- - data : Buffer - dtype - The type associated with the data Buffer - mask : Buffer, optional - children : tuple, optional - """ - super().__init__( - data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - children=children, - ) - def as_frame(self): +class ColumnBase(Column, Serializable): + def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame """ return cudf.core.frame.Frame({None: self.copy(deep=False)}) @property - def data_array_view(self): + def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object """ - if self.dtype == "object": - raise ValueError("Cannot get an array view of a StringColumn") - - if is_categorical_dtype(self.dtype): - return self.codes.data_array_view - else: - dtype = self.dtype - result = cuda.as_cuda_array(self.data) # Workaround until `.view(...)` can change itemsize # xref: https://github.com/numba/numba/issues/4829 result = cuda.devicearray.DeviceNDArray( - shape=(result.nbytes // dtype.itemsize,), - strides=(dtype.itemsize,), - dtype=dtype, + shape=(result.nbytes // self.dtype.itemsize,), + strides=(self.dtype.itemsize,), + dtype=self.dtype, gpu_data=result.gpu_data, ) return result @property - def mask_array_view(self): + def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array """ @@ -121,10 +104,12 @@ def mask_array_view(self): ) return result - def __len__(self): + def __len__(self) -> int: return self.size - def to_pandas(self, index=None, nullable=False, **kwargs): + def to_pandas( + self, index: ColumnLike = None, nullable: bool = False, **kwargs + ) -> "pd.Series": if nullable and self.dtype in cudf_dtypes_to_pandas_dtypes: pandas_nullable_dtype = cudf_dtypes_to_pandas_dtypes[self.dtype] arrow_array = self.to_arrow() @@ -143,14 +128,14 @@ def __iter__(self): cudf.utils.utils.raise_iteration_error(obj=self) @property - def values_host(self): + def values_host(self) -> "np.ndarray": """ Return a numpy representation of the Column. """ return self.data_array_view.copy_to_host() @property - def values(self): + def values(self) -> "cupy.ndarray": """ Return a CuPy representation of the Column.
""" @@ -162,14 +147,18 @@ def values(self): return cupy.asarray(self.data_array_view) - def clip(self, lo, hi): - if is_categorical_dtype(self): - input_col = self.astype(self.categories.dtype) - return libcudf.replace.clip(input_col, lo, hi).astype(self.dtype) - else: - return libcudf.replace.clip(self, lo, hi) + def find_and_replace( + self: T, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> T: + raise NotImplementedError + + def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase: + return libcudf.replace.clip(self, lo, hi) - def equals(self, other, check_dtypes=False): + def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: if self is other: return True if other is None or len(self) != len(other): @@ -179,21 +168,32 @@ def equals(self, other, check_dtypes=False): return False return (self == other).min() - def all(self): + def all(self) -> bool: return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) - def any(self): + def any(self) -> bool: return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) - def __sizeof__(self): - n = self.data.size + def __sizeof__(self) -> int: + n = 0 + if self.data is not None: + n += self.data.size if self.nullable: n += bitmask_allocation_size_bytes(self.size) return n - @classmethod - def _concat(cls, objs, dtype=None): + def cat( + self, parent=None + ) -> "cudf.core.column.categorical.CategoricalAccessor": + raise NotImplementedError() + + def str(self, parent=None) -> "cudf.core.column.string.StringMethods": + raise NotImplementedError() + @classmethod + def _concat( + cls, objs: "MutableSequence[ColumnBase]", dtype: Dtype = None + ) -> ColumnBase: if len(objs) == 0: dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype): @@ -281,7 +281,7 @@ def _concat(cls, objs, dtype=None): if is_categorical: col = build_categorical_column( - categories=cats, + categories=as_column(cats), codes=as_column(col.base_data, dtype=col.dtype), mask=col.base_mask, size=col.size, @@ -290,11 +290,17 @@ def _concat(cls, objs, dtype=None): return col - def dropna(self): - dropped_col = self.as_frame().dropna()._as_column() + def dropna(self, drop_nan: bool = False) -> ColumnBase: + if drop_nan: + col = self.nans_to_nulls() + else: + col = self + dropped_col = ( + col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column() + ) return dropped_col - def to_arrow(self): + def to_arrow(self) -> pa.Array: """Convert to PyArrow Array Examples @@ -343,7 +349,7 @@ def to_arrow(self): )["None"].chunk(0) @classmethod - def from_arrow(cls, array): + def from_arrow(cls, array: pa.Array) -> ColumnBase: """ Convert PyArrow Array/ChunkedArray to column @@ -405,15 +411,18 @@ def from_arrow(cls, array): "None" ] - def _get_mask_as_column(self): + def _get_mask_as_column(self) -> ColumnBase: return libcudf.transform.mask_to_bools( self.base_mask, self.offset, self.offset + len(self) ) - def _memory_usage(self, **kwargs): + def _memory_usage(self, **kwargs) -> int: return self.__sizeof__() - def to_gpu_array(self, fillna=None): + def default_na_value(self) -> Any: + raise NotImplementedError() + + def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": """Get a dense numba device array for the data. 
Parameters @@ -430,9 +439,9 @@ if fillna: return self.fillna(self.default_na_value()).data_array_view else: - return self.dropna().data_array_view + return self.dropna(drop_nan=False).data_array_view - def to_array(self, fillna=None): + def to_array(self, fillna=None) -> "np.ndarray": """Get a dense numpy array for the data. Parameters @@ -451,13 +460,16 @@ return self.to_gpu_array(fillna=fillna).copy_to_host() - def _fill(self, fill_value, begin=0, end=-1, inplace=False): + def _fill( + self, + fill_value: ScalarLike, + begin: int, + end: int, + inplace: bool = False, + ) -> Optional[ColumnBase]: if end <= begin or begin >= self.size: return self if inplace else self.copy() - if is_categorical_dtype(self.dtype): - return self._fill_categorical(fill_value, begin, end, inplace) - fill_scalar = as_device_scalar(fill_value, self.dtype) if not inplace: @@ -477,7 +489,6 @@ return self - def _fill_categorical(self, fill_value, begin, end, inplace): fill_code = self._encode(fill_value) fill_scalar = as_device_scalar(fill_code, self.codes.dtype) @@ -486,16 +497,16 @@ libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def shift(self, offset, fill_value): + def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: return libcudf.copying.shift(self, offset, fill_value) @property - def valid_count(self): + def valid_count(self) -> int: """Number of non-null values""" return len(self) - self.null_count @property - def nullmask(self): + def nullmask(self) -> Buffer: """The gpu buffer for the null-mask """ if self.nullable: @@ -503,7 +514,7 @@ else: raise ValueError("Column has no null mask") - def copy(self, deep=True): + def copy(self, deep: bool = True) -> ColumnBase: """Columns are immutable, so a deep copy produces a copy of the underlying data and mask and a shallow copy creates a new column and copies the references of the data and mask. @@ -520,7 +531,7 @@ children=self.base_children, ) - def view(self, dtype): + def view(self, dtype: Dtype) -> ColumnBase: """ View the data underlying a column as different dtype.
The source column must divide evenly into the size of @@ -562,6 +573,7 @@ def view(self, dtype): + f" total bytes into {dtype} with size {dtype.itemsize}" ) + assert self.base_data is not None new_buf_ptr = ( self.base_data.ptr + self.offset * self.dtype.itemsize ) @@ -573,7 +585,7 @@ def view(self, dtype): ) return build_column(view_buf, dtype=dtype) - def element_indexing(self, index): + def element_indexing(self, index: int): """Default implementation for indexing to an element Raises @@ -588,46 +600,29 @@ def element_indexing(self, index): return libcudf.copying.get_element(self, index).value - def __getitem__(self, arg): + def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase: + if start < 0: + start = start + len(self) + if stop < 0: + stop = stop + len(self) + if start >= stop: + return column_empty(0, self.dtype, masked=True) + # compute mask slice + if stride == 1 or stride is None: + return libcudf.copying.column_slice(self, [start, stop])[0] + else: + # Need to create a gather map for given slice with stride + gather_map = arange( + start=start, stop=stop, step=stride, dtype=np.dtype(np.int32), + ) + return self.take(gather_map) - if isinstance(arg, Number): - arg = int(arg) - return self.element_indexing(arg) + def __getitem__(self, arg) -> Union[ScalarLike, ColumnBase]: + if is_scalar(arg): + return self.element_indexing(int(arg)) elif isinstance(arg, slice): - - if is_categorical_dtype(self): - codes = self.codes[arg] - return build_categorical_column( - categories=self.categories, - codes=as_column(codes.base_data, dtype=codes.dtype), - mask=codes.base_mask, - ordered=self.ordered, - size=codes.size, - offset=codes.offset, - ) - start, stop, stride = arg.indices(len(self)) - - if start < 0: - start = start + len(self) - if stop < 0: - stop = stop + len(self) - - if start >= stop: - return column_empty(0, self.dtype, masked=True) - # compute mask slice - if stride == 1 or stride is None: - - return libcudf.copying.column_slice(self, [start, stop])[0] - else: - # Need to create a gather map for given slice with stride - gather_map = arange( - start=start, - stop=stop, - step=stride, - dtype=np.dtype(np.int32), - ) - return self.take(gather_map) + return self.slice(start, stop, stride) else: arg = as_column(arg) if len(arg) == 0: @@ -638,7 +633,7 @@ def __getitem__(self, arg): return self.apply_boolean_mask(arg) raise NotImplementedError(type(arg)) - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any): """ Set the value of self[key] to value. 
@@ -679,10 +674,7 @@ def __setitem__(self, key, value): nelem = len(key) if is_scalar(value): - if is_categorical_dtype(self.dtype): - value = self._encode(value) - else: - value = self.dtype.type(value) if value is not None else value + value = self.dtype.type(value) if value is not None else value else: if len(value) != nelem: msg = ( @@ -692,9 +684,6 @@ def __setitem__(self, key, value): ) raise ValueError(msg) value = as_column(value).astype(self.dtype) - if is_categorical_dtype(value.dtype): - value = value.cat().set_categories(self.categories) - assert self.dtype == value.dtype if ( isinstance(key, slice) @@ -705,34 +694,11 @@ def __setitem__(self, key, value): out = libcudf.copying.copy_range( value, self, 0, nelem, key_start, key_stop, False ) - if is_categorical_dtype(value.dtype): - out = build_categorical_column( - categories=value.categories, - codes=as_column(out.base_data, dtype=out.dtype), - mask=out.base_mask, - size=out.size, - offset=out.offset, - ordered=value.ordered, - ) else: try: if is_scalar(value): input = self - if is_categorical_dtype(self.dtype): - input = self.codes - out = input.as_frame()._scatter(key, [value])._as_column() - - if is_categorical_dtype(self.dtype): - out = build_categorical_column( - categories=self.categories, - codes=as_column(out.base_data, dtype=out.dtype), - mask=out.base_mask, - size=out.size, - offset=out.offset, - ordered=self.ordered, - ) - else: if not isinstance(value, Column): value = as_column(value) @@ -750,7 +716,12 @@ def __setitem__(self, key, value): self._mimic_inplace(out, inplace=True) - def fillna(self, value=None, method=None, dtype=None): + def fillna( + self: T, + value: Any = None, + method: builtins.str = None, + dtype: Dtype = None, + ) -> T: """Fill null values with ``value``. Returns a copy with null filled. @@ -759,7 +730,7 @@ def fillna(self, value=None, method=None, dtype=None): input_col=self, replacement=value, method=method, dtype=dtype ) - def isnull(self): + def isnull(self) -> ColumnBase: """Identify missing values in a Column. """ result = libcudf.unary.is_null(self) @@ -771,12 +742,12 @@ def isnull(self): return result - def isna(self): + def isna(self) -> ColumnBase: """Identify missing values in a Column. Alias for isnull. """ return self.isnull() - def notnull(self): + def notnull(self) -> ColumnBase: """Identify non-missing values in a Column. """ result = libcudf.unary.is_valid(self) @@ -788,12 +759,14 @@ def notnull(self): return result - def notna(self): + def notna(self) -> ColumnBase: """Identify non-missing values in a Column. Alias for notnull. 
""" return self.notnull() - def find_first_value(self, value): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ @@ -804,7 +777,7 @@ def find_first_value(self, value): raise ValueError("value not found") return indices[0] - def find_last_value(self, value): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ @@ -815,21 +788,26 @@ def find_last_value(self, value): raise ValueError("value not found") return indices[-1] - def append(self, other): + def append(self, other: ColumnBase) -> ColumnBase: return ColumnBase._concat([self, as_column(other)]) - def quantile(self, q, interpolation, exact): + def quantile( + self, + q: Union[float, Sequence[float]], + interpolation: builtins.str, + exact: bool, + ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") - def median(self, skipna=None): + def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") - def take(self, indices, keep_index=True): + def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T: """Return Column by taking values from the corresponding *indices*. """ # Handle zero size if indices.size == 0: - return column_empty_like(self, newsize=0) + return cast(T, column_empty_like(self, newsize=0)) try: return ( self.as_frame() @@ -843,7 +821,7 @@ def take(self, indices, keep_index=True): ) from e raise - def isin(self, values): + def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. Parameters @@ -898,17 +876,17 @@ def isin(self, values): rhs = as_column(pd.Categorical.from_codes([-1], categories=[])) rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype) - lhs = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) - rhs = cudf.DataFrame( + ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))}) + rdf = cudf.DataFrame( {"x": rhs, "bool": full(len(rhs), True, dtype="bool")} ) - res = lhs.merge(rhs, on="x", how="left").sort_values(by="orig_order") + res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order") res = res.drop_duplicates(subset="orig_order", ignore_index=True) res = res._data["bool"].fillna(False) return res - def as_mask(self): + def as_mask(self) -> Buffer: """Convert booleans to bitmask Returns @@ -928,15 +906,15 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) @property - def is_unique(self): + def is_unique(self) -> bool: return self.distinct_count() == len(self) @property - def is_monotonic(self): + def is_monotonic(self) -> bool: return self.is_monotonic_increasing @property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: if not hasattr(self, "_is_monotonic_increasing"): if self.has_nulls: self._is_monotonic_increasing = False @@ -947,7 +925,7 @@ def is_monotonic_increasing(self): return self._is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: if not hasattr(self, "_is_monotonic_decreasing"): if self.has_nulls: self._is_monotonic_decreasing = False @@ -957,14 +935,16 @@ def is_monotonic_decreasing(self): ) return self._is_monotonic_decreasing - def get_slice_bound(self, label, side, kind): + def get_slice_bound( + self, label: ScalarLike, side: builtins.str, kind: builtins.str + ) -> int: """ Calculate slice bound that corresponds to given label. 
Returns leftmost (one-past-the-rightmost if ``side=='right'``) position of given label. Parameters ---------- - label : object + label : Scalar side : {'left', 'right'} kind : {'ix', 'loc', 'getitem'} """ @@ -979,21 +959,29 @@ def get_slice_bound(self, label, side, kind): # Not currently using `kind` argument. if side == "left": return self.find_first_value(label, closest=True) - if side == "right": + elif side == "right": return self.find_last_value(label, closest=True) + 1 + else: + raise ValueError(f"Invalid value for side: {side}") - def sort_by_values(self, ascending=True, na_position="last"): + def sort_by_values( + self: ColumnBase, + ascending: bool = True, + na_position: builtins.str = "last", + ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]: col_inds = self.as_frame()._get_sorted_inds(ascending, na_position) - col_keys = self[col_inds] + col_keys = self.take(col_inds) return col_keys, col_inds - def distinct_count(self, method="sort", dropna=True): + def distinct_count( + self, method: builtins.str = "sort", dropna: bool = True + ) -> int: if method != "sort": msg = "non sort based distinct_count() not implemented yet" raise NotImplementedError(msg) return cpp_distinct_count(self, ignore_nulls=dropna) - def astype(self, dtype, **kwargs): + def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { @@ -1015,7 +1003,7 @@ def astype(self, dtype, **kwargs): else: return self.as_numerical_column(dtype) - def as_categorical_column(self, dtype, **kwargs): + def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: if "ordered" in kwargs: ordered = kwargs["ordered"] else: @@ -1058,26 +1046,36 @@ def as_categorical_column(self, dtype, **kwargs): ordered=ordered, ) - def as_numerical_column(self, dtype): + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": raise NotImplementedError - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": raise NotImplementedError - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": raise NotImplementedError - def as_string_column(self, dtype, **kwargs): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": raise NotImplementedError - def apply_boolean_mask(self, mask): + def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask, dtype="bool") result = ( self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() ) return result - def argsort(self, ascending=True, na_position="last"): + def argsort( + self, ascending: bool = True, na_position: builtins.str = "last" + ) -> ColumnBase: sorted_indices = self.as_frame()._get_sorted_inds( ascending=ascending, na_position=na_position @@ -1085,7 +1083,7 @@ def argsort(self, ascending=True, na_position="last"): return sorted_indices @property - def __cuda_array_interface__(self): + def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), @@ -1157,14 +1155,18 @@ def __ge__(self, other): return self.binary_operator("ge", other) def searchsorted( - self, value, side="left", ascending=True, na_position="last" + self, + value, + side: builtins.str = "left", + ascending: bool = True, + na_position: builtins.str = 
"last", ): values = as_column(value).as_frame() return self.as_frame().searchsorted( values, side, ascending=ascending, na_position=na_position ) - def unique(self): + def unique(self) -> ColumnBase: """ Get unique values in the data """ @@ -1174,17 +1176,18 @@ def unique(self): ._as_column() ) - def serialize(self): - header = {} + def serialize(self) -> Tuple[dict, list]: + header = {} # type: Dict[Any, Any] frames = [] header["type-serialized"] = pickle.dumps(type(self)) header["dtype"] = self.dtype.str - data_header, data_frames = self.data.serialize() - header["data"] = data_header - frames.extend(data_frames) + if self.data is not None: + data_header, data_frames = self.data.serialize() + header["data"] = data_header + frames.extend(data_frames) - if self.nullable: + if self.mask is not None: mask_header, mask_frames = self.mask.serialize() header["mask"] = mask_header frames.extend(mask_frames) @@ -1193,7 +1196,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> ColumnBase: dtype = header["dtype"] data = Buffer.deserialize(header["data"], [frames[0]]) mask = None @@ -1201,61 +1204,71 @@ def deserialize(cls, header, frames): mask = Buffer.deserialize(header["mask"], [frames[1]]) return build_column(data=data, dtype=dtype, mask=mask) - def min(self, skipna=None, dtype=None): + def binary_operator( + self, op: builtins.str, other: BinaryOperand, reflect: bool = False + ) -> ColumnBase: + raise NotImplementedError + + def min(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("min", result_col, dtype=dtype) else: return result_col - def max(self, skipna=None, dtype=None): + def max(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("max", result_col, dtype=dtype) else: return result_col - def sum(self, skipna=None, dtype=None, min_count=0): + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): raise TypeError(f"cannot perform sum with type {self.dtype}") - def product(self, skipna=None, dtype=None, min_count=0): + def product( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): raise TypeError(f"cannot perform prod with type {self.dtype}") - def mean(self, skipna=None, dtype=None): + def mean(self, skipna: bool = None, dtype: Dtype = None): raise TypeError(f"cannot perform mean with type {self.dtype}") - def std(self, skipna=None, ddof=1, dtype=np.float64): + def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform std with type {self.dtype}") - def var(self, skipna=None, ddof=1, dtype=np.float64): + def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): raise TypeError(f"cannot perform var with type {self.dtype}") - def kurtosis(self, skipna=None): + def kurtosis(self, skipna: bool = None): raise TypeError(f"cannot perform kurt with type {self.dtype}") - def skew(self, skipna=None): + def skew(self, skipna: bool = None): raise TypeError(f"cannot perform skew with type {self.dtype}") - def cov(self, other): + def cov(self, other: ColumnBase): raise TypeError( f"cannot perform covarience with types {self.dtype}, " f"{other.dtype}" ) - def corr(self, other): + def corr(self, other: ColumnBase): raise TypeError( f"cannot perform corr 
with types {self.dtype}, {other.dtype}" ) - def nans_to_nulls(self): + def nans_to_nulls(self: T) -> T: if self.dtype.kind == "f": - col = self.fillna(np.nan) - newmask = libcudf.transform.nans_to_nulls(col) + newmask = libcudf.transform.nans_to_nulls(self) return self.set_mask(newmask) else: return self - def _process_for_reduction(self, skipna=None, min_count=0): + def _process_for_reduction( + self, skipna: bool = None, min_count: int = 0 + ) -> Union[ColumnBase, ScalarLike]: skipna = True if skipna is None else skipna if skipna: @@ -1280,8 +1293,13 @@ def _process_for_reduction(self, skipna=None, min_count=0): return result_col def scatter_to_table( - self, row_indices, column_indices, names, nrows=None, ncols=None - ): + self, + row_indices: ColumnBase, + column_indices: ColumnBase, + names: List[Any], + nrows: int = None, + ncols: int = None, + ) -> "cudf.core.frame.Frame": """ Scatters values from the column into a table. @@ -1326,7 +1344,12 @@ def scatter_to_table( ) -def column_empty_like(column, dtype=None, masked=False, newsize=None): +def column_empty_like( + column: ColumnBase, + dtype: Dtype = None, + masked: bool = False, + newsize: int = None, +) -> ColumnBase: """Allocate a new column like the given *column* """ if dtype is None: @@ -1338,6 +1361,7 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None): and is_categorical_dtype(column.dtype) and dtype == column.dtype ): + column = cast("cudf.core.column.CategoricalColumn", column) codes = column_empty_like(column.codes, masked=masked, newsize=newsize) return build_column( data=None, @@ -1350,7 +1374,9 @@ def column_empty_like(column, dtype=None, masked=False, newsize=None): return column_empty(row_count, dtype, masked) -def column_empty_like_same_mask(column, dtype): +def column_empty_like_same_mask( + column: ColumnBase, dtype: Dtype +) -> ColumnBase: """Create a new empty Column with the same length and the same mask. Parameters @@ -1364,11 +1390,13 @@ def column_empty_like_same_mask(column, dtype): return result -def column_empty(row_count, dtype="object", masked=False): +def column_empty( + row_count: int, dtype: Dtype = "object", masked: bool = False +) -> ColumnBase: """Allocate a new column like the given row_count and dtype. """ dtype = pd.api.types.pandas_dtype(dtype) - children = () + children = () # type: Tuple[ColumnBase, ...] if is_categorical_dtype(dtype): data = None @@ -1401,8 +1429,15 @@ def column_empty(row_count, dtype="object", masked=False): def build_column( - data, dtype, mask=None, size=None, offset=0, null_count=None, children=() -): + data: Union[Buffer, None], + dtype: Dtype, + *, + size: int = None, + mask: Buffer = None, + offset: int = 0, + null_count: int = None, + children: Tuple[ColumnBase, ...] 
= (), +) -> ColumnBase: """ Build a Column of the appropriate type from the given parameters @@ -1437,6 +1472,7 @@ def build_column( children=children, ) elif dtype.type is np.datetime64: + assert data is not None return cudf.core.column.DatetimeColumn( data=data, dtype=dtype, @@ -1446,6 +1482,7 @@ def build_column( null_count=null_count, ) elif dtype.type is np.timedelta64: + assert data is not None return cudf.core.column.TimeDeltaColumn( data=data, dtype=dtype, @@ -1473,6 +1510,15 @@ def build_column( ) elif is_struct_dtype(dtype): return cudf.core.column.StructColumn( + data=data, + dtype=dtype, + size=size, + mask=mask, + null_count=null_count, + children=children, + ) + elif is_decimal_dtype(dtype): + return cudf.core.column.DecimalColumn( data=data, size=size, dtype=dtype, @@ -1481,6 +1527,7 @@ def build_column( children=children, ) else: + assert data is not None return cudf.core.column.NumericalColumn( data=data, dtype=dtype, @@ -1492,14 +1539,14 @@ def build_column( def build_categorical_column( - categories, - codes, - mask=None, - size=None, - offset=0, - null_count=None, - ordered=None, -): + categories: ColumnBase, + codes: ColumnBase, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + ordered: bool = None, +) -> "cudf.core.column.CategoricalColumn": """ Build a CategoricalColumn @@ -1523,9 +1570,9 @@ def build_categorical_column( if codes.dtype != codes_dtype: codes = codes.astype(codes_dtype) - dtype = CategoricalDtype(categories=as_column(categories), ordered=ordered) + dtype = CategoricalDtype(categories=categories, ordered=ordered) - return build_column( + result = build_column( data=None, dtype=dtype, mask=mask, @@ -1534,9 +1581,15 @@ def build_categorical_column( null_count=null_count, children=(codes,), ) + return cast("cudf.core.column.CategoricalColumn", result) -def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): +def as_column( + arbitrary: Any, + nan_as_null: bool = None, + dtype: Dtype = None, + length: int = None, +): """Create a Column from an arbitrary object Parameters @@ -1773,7 +1826,10 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): mask = data.mask data = cudf.core.column.timedelta.TimeDeltaColumn( - data=buffer, mask=mask, dtype=arbitrary.dtype + data=buffer, + size=len(arbitrary), + mask=mask, + dtype=arbitrary.dtype, ) elif arb_dtype.kind in ("O", "U"): data = as_column( @@ -1822,9 +1878,7 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) elif isinstance(arbitrary, cudf.Scalar): - data = libcudf.column.make_column_from_scalar( - arbitrary, length if length else 1 - ) + data = ColumnBase.from_scalar(arbitrary, length if length else 1) elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray): cudf_dtype = arbitrary._data.dtype @@ -1853,6 +1907,14 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): "Cannot create list column from given data" ) return as_column(data, nan_as_null=nan_as_null) + if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.DecimalColumn.from_arrow(data) dtype = pd.api.types.pandas_dtype(dtype) if is_categorical_dtype(dtype): raise TypeError @@ -1898,7 +1960,11 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None): return data -def column_applymap(udf, column, out_dtype): +def 
column_applymap( + udf: Callable[[ScalarLike], ScalarLike], + column: ColumnBase, + out_dtype: Dtype, +) -> ColumnBase: """Apply an element-wise function to transform the values in the Column. Parameters @@ -1946,7 +2012,7 @@ return as_column(results) -def _data_from_cuda_array_interface_desc(obj): +def _data_from_cuda_array_interface_desc(obj) -> Buffer: desc = obj.__cuda_array_interface__ ptr = desc["data"][0] nelem = desc["shape"][0] if len(desc["shape"]) > 0 else 1 @@ -1956,7 +2022,7 @@ return data -def _mask_from_cuda_array_interface_desc(obj): +def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]: desc = obj.__cuda_array_interface__ mask = desc.get("mask", None) @@ -1979,7 +2045,7 @@ return mask -def serialize_columns(columns): +def serialize_columns(columns) -> Tuple[List[dict], List]: """ Return the headers and frames resulting from serializing a list of Column @@ -1994,7 +2060,7 @@ frames : list list of frames """ - headers = [] + headers = [] # type: List[Dict[Any, Any]] frames = [] if len(columns) > 0: @@ -2006,7 +2072,7 @@ return headers, frames -def deserialize_columns(headers, frames): +def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: """ Construct a list of Columns from a list of headers and frames. @@ -2024,7 +2090,12 @@ return columns -def arange(start, stop=None, step=1, dtype=None): +def arange( + start: Union[int, float], + stop: Union[int, float] = None, + step: Union[int, float] = 1, + dtype=None, +) -> ColumnBase: """ Returns a column with evenly spaced values within a given interval. @@ -2077,7 +2148,7 @@ ) -def full(size, fill_value, dtype=None): +def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: """ Returns a column of given size and dtype, filled with a given value. @@ -2108,7 +2179,4 @@ 4 7 dtype: int8 """ - - return libcudf.column.make_column_from_scalar( - cudf.Scalar(fill_value, dtype), size - ) + return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 4561b1f68f2..8ae16288050 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,7 +1,10 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION.
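serialize_columns/deserialize_columns above split a list of columns into picklable headers plus raw frames, which is how Column objects travel between workers under dask and distributed. A hedged round-trip sketch (internal module-level API as shown in the hunks; assumes a CUDA-capable runtime):

    from cudf.core.column import column

    cols = [column.arange(0, 4), column.full(4, 7, dtype="int8")]
    headers, frames = column.serialize_columns(cols)
    restored = column.deserialize_columns(headers, frames)
    assert [c.dtype for c in restored] == [c.dtype for c in cols]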
+from __future__ import annotations + import datetime as dt import re from numbers import Number +from typing import Any, Sequence, Union, cast import numpy as np import pandas as pd @@ -9,7 +12,9 @@ import cudf from cudf import _lib as libcudf -from cudf.core.column import column, string +from cudf._typing import DatetimeLikeScalar, Dtype, DtypeObj, ScalarLike +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import is_scalar from cudf.utils.utils import _fillna_natwise @@ -34,7 +39,13 @@ class DatetimeColumn(column.ColumnBase): def __init__( - self, data, dtype, mask=None, size=None, offset=0, null_count=None + self, + data: Buffer, + dtype: DtypeObj, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -66,49 +77,51 @@ def __init__( self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: try: - item = np.datetime64(item, self._time_unit) + item_as_dt64 = np.datetime64(item, self._time_unit) except ValueError: # If item cannot be converted to datetime type # np.datetime64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item.astype("int64") in self.as_numerical + return item_as_dt64.astype("int64") in self.as_numerical @property - def time_unit(self): + def time_unit(self) -> str: return self._time_unit @property - def year(self): + def year(self) -> ColumnBase: return self.get_dt_field("year") @property - def month(self): + def month(self) -> ColumnBase: return self.get_dt_field("month") @property - def day(self): + def day(self) -> ColumnBase: return self.get_dt_field("day") @property - def hour(self): + def hour(self) -> ColumnBase: return self.get_dt_field("hour") @property - def minute(self): + def minute(self) -> ColumnBase: return self.get_dt_field("minute") @property - def second(self): + def second(self) -> ColumnBase: return self.get_dt_field("second") @property - def weekday(self): + def weekday(self) -> ColumnBase: return self.get_dt_field("weekday") - def to_pandas(self, index=None, **kwargs): + def to_pandas( + self, index: "cudf.Index" = None, nullable: bool = False, **kwargs + ) -> "cudf.Series": # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -122,10 +135,10 @@ def to_pandas(self, index=None, **kwargs): return pd_series - def get_dt_field(self, field): + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, cudf.Scalar): return other @@ -162,30 +175,41 @@ def normalize_binop_value(self, other): raise TypeError(f"cannot normalize {type(other)}") @property - def as_numerical(self): - return column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_numerical(self) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", + column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_datetime_column(self, dtype, **kwargs): + def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def 
as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": raise TypeError( f"cannot astype a datetimelike from [{self.dtype}] to [{dtype}]" ) - def as_numerical_column(self, dtype): - return self.as_numerical.astype(dtype) + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + ) - def as_string_column(self, dtype, format=None): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": if format is None: format = _dtype_to_format_conversion.get( self.dtype.name, "%Y-%m-%d %H:%M:%S" @@ -195,20 +219,25 @@ def as_string_column(self, dtype, format=None): np.dtype(self.dtype) ](self, format) else: - return column.column_empty(0, dtype="object", masked=False) + return cast( + "cudf.core.column.StringColumn", + column.column_empty(0, dtype="object", masked=False), + ) - def default_na_value(self): + def default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column """ return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64): + def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> ColumnBase: result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact ) @@ -216,18 +245,23 @@ def quantile(self, q, interpolation, exact): return pd.Timestamp(result, unit=self.time_unit) return result.astype(self.dtype) - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, + op: str, + rhs: Union[ColumnBase, "cudf.Scalar"], + reflect: bool = False, + ) -> ColumnBase: if isinstance(rhs, cudf.DateOffset): return binop_offset(self, rhs, op) lhs, rhs = self, rhs if op in ("eq", "ne", "lt", "gt", "le", "ge"): out_dtype = np.bool elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): - out_dtype = cudf.core.column.timedelta._timedelta_binary_op_add( + out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype( rhs, lhs ) elif op == "sub" and pd.api.types.is_timedelta64_dtype(rhs.dtype): - out_dtype = cudf.core.column.timedelta._timedelta_binary_op_sub( + out_dtype = cudf.core.column.timedelta._timedelta_sub_result_dtype( rhs if reflect else lhs, lhs if reflect else rhs ) elif op == "sub" and pd.api.types.is_datetime64_dtype(rhs.dtype): @@ -244,13 +278,11 @@ def binary_operator(self, op, rhs, reflect=False): f"Series of dtype {self.dtype} cannot perform " f" the operation {op}" ) + return binop(lhs, rhs, op=op, out_dtype=out_dtype, reflect=reflect) - if reflect: - lhs, rhs = rhs, lhs - - return binop(lhs, rhs, op=op, out_dtype=out_dtype) - - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: str = None, dtype: Dtype = None + ) -> DatetimeColumn: if fill_value is not None: if cudf.utils.utils.isnat(fill_value): return _fillna_natwise(self) @@ -262,7 +294,9 @@ def fillna(self, fill_value=None, method=None): return super().fillna(fill_value, method) - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches """ @@ -270,7 +304,7 @@ 
def find_first_value(self, value, closest=False): value = column.as_column(value, dtype=self.dtype).as_numerical[0] return self.as_numerical.find_first_value(value, closest=closest) - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches """ @@ -279,10 +313,10 @@ def find_last_value(self, value, closest=False): return self.as_numerical.find_last_value(value, closest=closest) @property - def is_unique(self): + def is_unique(self) -> bool: return self.as_numerical.is_unique - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: Dtype) -> bool: if np.issubdtype(to_dtype, np.datetime64): to_res, _ = np.datetime_data(to_dtype) @@ -315,7 +349,15 @@ @annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop(lhs, rhs, op, out_dtype): +def binop( + lhs: Union[ColumnBase, ScalarLike], + rhs: Union[ColumnBase, ScalarLike], + op: str, + out_dtype: Dtype, + reflect: bool, +) -> ColumnBase: + if reflect: + lhs, rhs = rhs, lhs out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) return out @@ -329,11 +371,10 @@ return out -def infer_format(element, **kwargs): +def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes care of `ms` and `ns` """ - fmt = pd.core.tools.datetimes._guess_datetime_format(element, **kwargs) if fmt is not None: @@ -345,8 +386,8 @@ # There is a possibility that the element is of the following format # '00:00:03.333333 2016-01-01' - second_part = re.split(r"(\D+)", element_parts[1], maxsplit=1) - subsecond_fmt = ".%" + str(len(second_part[0])) + "f" + second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1) + subsecond_fmt = ".%" + str(len(second_parts[0])) + "f" first_part = pd.core.tools.datetimes._guess_datetime_format( element_parts[0], **kwargs @@ -360,16 +401,16 @@ if first_part is None: raise ValueError("Unable to infer the timestamp format from the data") - if len(second_part) > 1: + if len(second_parts) > 1: # "Z" indicates Zulu time (widely used in aviation), which is # the UTC timezone, the only timezone cudf currently supports. # Passing any other, unsupported timezone will make the code fail below # with a ValueError. - second_part.remove("Z") - second_part = "".join(second_part[1:]) + second_parts.remove("Z") + second_part = "".join(second_parts[1:]) if len(second_part) > 1: # Only infer if second_part is not an empty string. second_part = pd.core.tools.datetimes._guess_datetime_format( second_part, **kwargs ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py new file mode 100644 index 00000000000..58156c3826c --- /dev/null +++ b/python/cudf/cudf/core/column/decimal.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021, NVIDIA CORPORATION.
+ +import cupy as cp +import numpy as np +import pyarrow as pa + +from cudf import _lib as libcudf +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase +from cudf.core.dtypes import Decimal64Dtype +from cudf.utils.utils import pa_mask_buffer_to_mask + + +class DecimalColumn(ColumnBase): + @classmethod + def from_arrow(cls, data: pa.Array): + dtype = Decimal64Dtype.from_arrow(data.type) + mask_buf = data.buffers()[0] + mask = ( + mask_buf + if mask_buf is None + else pa_mask_buffer_to_mask(mask_buf, len(data)) + ) + data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64")) + data_64 = data_128[::2].copy() + return cls( + data=Buffer(data_64.view("uint8")), + size=len(data), + dtype=dtype, + mask=mask, + ) + + def to_arrow(self): + data_buf_64 = self.base_data.to_host_array().view("int64") + data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") + # use striding to set the first 64 bits of each 128-bit chunk: + data_buf_128[::2] = data_buf_64 + # use striding again to set the remaining bits of each 128-bit chunk: + # 0 for non-negative values, -1 for negative values: + data_buf_128[1::2] = np.piecewise( + data_buf_64, [data_buf_64 < 0], [-1, 0] + ) + data_buf = pa.py_buffer(data_buf_128) + mask_buf = ( + self.base_mask + if self.base_mask is None + else pa.py_buffer(self.base_mask.to_host_array()) + ) + return pa.Array.from_buffers( + type=self.dtype.to_arrow(), + length=self.size, + buffers=[mask_buf, data_buf], + ) + + def binary_operator(self, op, other, reflect=False): + if reflect: + self, other = other, self + result = libcudf.binaryop.binaryop(self, other, op, "int32") + result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) + return result + + +def _binop_precision(l_dtype, r_dtype, op): + """ + Returns the result precision when performing the + binary operation `op` for the given dtypes. + + See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql + """ # noqa: E501 + p1, p2 = l_dtype.precision, r_dtype.precision + s1, s2 = l_dtype.scale, r_dtype.scale + if op in ("add", "sub"): + return max(s1, s2) + max(p1 - s1, p2 - s2) + 1 + elif op == "mul": + return p1 + p2 + 1 + else: + raise NotImplementedError() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c2aa41a5de1..8641bc88806 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -173,8 +173,7 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) @property def leaves(self): diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 8395c9c3da6..eec9c2a7860 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -1,9 +1,57 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
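The @overload stubs added below pair typing_extensions.Literal with the runtime inplace flag so that mypy knows _return_or_inplace returns None when inplace=True and a Series/Index otherwise. A minimal sketch of the same pattern with a hypothetical function:

from typing import Optional, overload
from typing_extensions import Literal

@overload
def double(x: int, inplace: Literal[True]) -> None: ...
@overload
def double(x: int, inplace: Literal[False] = ...) -> int: ...
def double(x: int, inplace: bool = False) -> Optional[int]:
    # only the stubs above are visible to mypy; this is the runtime body
    return None if inplace else 2 * x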
+from __future__ import annotations + +from typing import TYPE_CHECKING, Optional, Union, overload + +from typing_extensions import Literal + import cudf +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class ColumnMethodsMixin: + _column: ColumnBase + _parent: Optional[Union["cudf.Series", "cudf.Index"]] + + def __init__( + self, + column: ColumnBase, + parent: Union["cudf.Series", "cudf.Index"] = None, + ): + self._column = column + self._parent = parent + + @overload + def _return_or_inplace( + self, new_col, inplace: Literal[False], expand=False, retain_index=True + ) -> Union["cudf.Series", "cudf.Index"]: + ... + + @overload + def _return_or_inplace( + self, new_col, expand: bool = False, retain_index: bool = True + ) -> Union["cudf.Series", "cudf.Index"]: + ... + + @overload + def _return_or_inplace( + self, new_col, inplace: Literal[True], expand=False, retain_index=True + ) -> None: + ... + + @overload + def _return_or_inplace( + self, + new_col, + inplace: bool = False, + expand: bool = False, + retain_index: bool = True, + ) -> Optional[Union["cudf.Series", "cudf.Index"]]: + ... + def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True ): @@ -19,31 +67,29 @@ def _return_or_inplace( ), inplace=True, ) + return None else: self._column._mimic_inplace(new_col, inplace=True) + return None else: + if self._parent is None: + return new_col if expand or isinstance( self._parent, (cudf.DataFrame, cudf.MultiIndex) ): # This branch indicates that the value passed as new_col - # is actually a table-like data + # is a Table table = new_col - if isinstance(table, cudf._lib.table.Table): - if isinstance(self._parent, cudf.Index): - idx = self._parent._constructor_expanddim._from_table( - table=table - ) - idx.names = None - return idx - else: - return self._parent._constructor_expanddim( - data=table._data, index=self._parent.index - ) + if isinstance(self._parent, cudf.Index): + idx = self._parent._constructor_expanddim._from_table( + table=table + ) + idx.names = None + return idx else: return self._parent._constructor_expanddim( - {index: value for index, value in enumerate(table)}, - index=self._parent.index, + data=table._data, index=self._parent.index ) elif isinstance(self._parent, cudf.Series): if retain_index: @@ -59,7 +105,4 @@ def _return_or_inplace( new_col, name=self._parent.name ) else: - if self._parent is None: - return new_col - else: - return self._parent._mimic_inplace(new_col, inplace=False) + return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 54a6d274843..f77c408f205 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,6 +1,8 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION.
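For context on the binary_operator hunk below: the output dtype comes from np.result_type, with integer mod/floordiv promoted to float64 when the divisor can be zero, since that result is only representable as a float. Plain NumPy, for illustration:

import numpy as np

print(np.result_type(np.dtype("int32"), np.dtype("int64")))    # int64
print(np.result_type(np.dtype("int32"), np.dtype("float32")))  # float64
# integer division by zero has no integer representation,
# hence the float64 fallback in the code below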
+from __future__ import annotations from numbers import Number +from typing import Any, Callable, Sequence, Union, cast import numpy as np import pandas as pd @@ -10,8 +12,15 @@ import cudf from cudf import _lib as libcudf from cudf._lib.quantiles import quantile as cpp_quantile +from cudf._typing import BinaryOperand, ColumnLike, Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer -from cudf.core.column import as_column, build_column, column, string +from cudf.core.column import ( + ColumnBase, + as_column, + build_column, + column, + string, +) from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( min_column_type, @@ -21,9 +30,15 @@ ) -class NumericalColumn(column.ColumnBase): +class NumericalColumn(ColumnBase): def __init__( - self, data, dtype, mask=None, size=None, offset=0, null_count=None + self, + data: Buffer, + dtype: DtypeObj, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -39,6 +54,7 @@ def __init__( if size is None: size = data.size // dtype.itemsize size = size - offset + super().__init__( data, size=size, @@ -48,7 +64,7 @@ def __init__( null_count=null_count, ) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: """ Returns True if column contains item, else False. """ @@ -66,10 +82,12 @@ def __contains__(self, item): self, column.as_column([item], dtype=self.dtype) ).any() - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: str) -> ColumnBase: return _numeric_column_unaryop(self, op=unaryop) - def binary_operator(self, binop, rhs, reflect=False): + def binary_operator( + self, binop: str, rhs: BinaryOperand, reflect: bool = False, + ) -> ColumnBase: int_dtypes = [ np.dtype("int8"), np.dtype("int16"), @@ -80,32 +98,33 @@ def binary_operator(self, binop, rhs, reflect=False): np.dtype("uint32"), np.dtype("uint64"), ] - tmp = rhs - if reflect: - tmp = self - if isinstance(rhs, (NumericalColumn, cudf.Scalar)) or np.isscalar(rhs): + if rhs is None: + out_dtype = self.dtype + else: + if not ( + isinstance(rhs, (NumericalColumn, cudf.Scalar,),) + or np.isscalar(rhs) + ): + msg = "{!r} operator not supported between {} and {}" + raise TypeError(msg.format(binop, type(self), type(rhs))) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: + tmp = self if reflect else rhs if (tmp.dtype in int_dtypes) and ( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): out_dtype = np.dtype("float64") - elif rhs is None: - out_dtype = self.dtype - else: - raise TypeError( - f"'{binop}' operator not supported between " - f"{type(self).__name__} and {type(rhs).__name__}" - ) return _numeric_column_binop( lhs=self, rhs=rhs, op=binop, out_dtype=out_dtype, reflect=reflect ) - def _apply_scan_op(self, op): + def _apply_scan_op(self, op: str) -> ColumnBase: return libcudf.reduce.scan(op, self, True) - def normalize_binop_value(self, other): + def normalize_binop_value( + self, other: ScalarLike + ) -> Union[ColumnBase, ScalarLike]: if other is None: return other if isinstance(other, cudf.Scalar): @@ -122,8 +141,8 @@ def normalize_binop_value(self, other): return other other_dtype = np.promote_types(self.dtype, other_dtype) if other_dtype == np.dtype("float16"): - other = np.dtype("float32").type(other) - other_dtype = other.dtype + other_dtype = np.dtype("float32") + other = other_dtype.type(other) if self.dtype.kind == "b": other_dtype = min_signed_type(other) if np.isscalar(other): @@ 
-134,104 +153,110 @@ def normalize_binop_value(self, other): other, size=len(self), dtype=other_dtype ) return column.build_column( - data=Buffer.from_array_like(ary), - dtype=ary.dtype, - mask=self.mask, + data=Buffer(ary), dtype=ary.dtype, mask=self.mask, ) else: raise TypeError(f"cannot broadcast {type(other)}") - def int2ip(self): + def int2ip(self) -> "cudf.core.column.StringColumn": if self.dtype != np.dtype("int64"): raise TypeError("Only int64 type can be converted to ip") return libcudf.string_casting.int2ip(self) - def as_string_column(self, dtype, **kwargs): + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ np.dtype(self.dtype) ](self) else: - return as_column([], dtype="object") - - def as_datetime_column(self, dtype, **kwargs): + return cast( + "cudf.core.column.StringColumn", as_column([], dtype="object") + ) - return build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": + return cast( + "cudf.core.column.DatetimeColumn", + build_column( + data=self.astype("int64").base_data, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_timedelta_column(self, dtype, **kwargs): - - return build_column( - data=self.astype("int64").base_data, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": + return cast( + "cudf.core.column.TimeDeltaColumn", + build_column( + data=self.astype("int64").base_data, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def as_numerical_column(self, dtype): + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype) - def sum(self, skipna=None, dtype=None, min_count=0): - result_col = self._process_for_reduction( + def reduce(self, op: str, skipna: bool = None, **kwargs) -> float: + min_count = kwargs.pop("min_count", 0) + preprocessed = self._process_for_reduction( skipna=skipna, min_count=min_count ) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("sum", result_col, dtype=dtype) + if isinstance(preprocessed, ColumnBase): + return libcudf.reduce.reduce(op, preprocessed, **kwargs) else: - return result_col + return cast(float, preprocessed) - def product(self, skipna=None, dtype=None, min_count=0): - result_col = self._process_for_reduction( - skipna=skipna, min_count=min_count + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ) -> float: + return self.reduce( + "sum", skipna=skipna, dtype=dtype, min_count=min_count ) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("product", result_col, dtype=dtype) - else: - return result_col - def mean(self, skipna=None, dtype=np.float64): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("mean", result_col, dtype=dtype) - else: - return result_col + def product( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ) -> float: + return self.reduce( + "product", skipna=skipna, dtype=dtype, min_count=min_count 
+ ) - def var(self, skipna=None, ddof=1, dtype=np.float64): - result = self._process_for_reduction(skipna=skipna) - if isinstance(result, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce("var", result, dtype=dtype, ddof=ddof) - else: - return result + def mean(self, skipna: bool = None, dtype: Dtype = np.float64) -> float: + return self.reduce("mean", skipna=skipna, dtype=dtype) - def std(self, skipna=None, ddof=1, dtype=np.float64): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, cudf.core.column.ColumnBase): - return libcudf.reduce.reduce( - "std", result_col, dtype=dtype, ddof=ddof - ) - else: - return result_col + def var( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> float: + return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof) - def sum_of_squares(self, dtype=None): + def std( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> float: + return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) + + def sum_of_squares(self, dtype: Dtype = None) -> float: return libcudf.reduce.reduce("sum_of_squares", self, dtype=dtype) - def kurtosis(self, skipna=None): + def kurtosis(self, skipna: bool = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or (not skipna and self.has_nulls): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() + self = self.nans_to_nulls().dropna() # type: ignore if len(self) < 4: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -250,13 +275,13 @@ def kurtosis(self, skipna=None): kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt - def skew(self, skipna=None): + def skew(self, skipna: bool = None) -> ScalarLike: skipna = True if skipna is None else skipna if len(self) == 0 or (not skipna and self.has_nulls): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - self = self.nans_to_nulls().dropna() + self = self.nans_to_nulls().dropna() # type: ignore if len(self) < 3: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -273,7 +298,9 @@ def skew(self, skipna=None): skew = unbiased_coef * m3 / (m2 ** (3 / 2)) return skew - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> NumericalColumn: if isinstance(q, Number) or cudf.utils.dtypes.is_list_like(q): np_array_q = np.asarray(q) if np.logical_or(np_array_q < 0, np_array_q > 1).any(): @@ -284,15 +311,14 @@ def quantile(self, q, interpolation, exact): # will only have values in range [0, 1] result = self._numeric_quantile(q, interpolation, exact) if isinstance(q, Number): - result = result[0] return ( cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - if result is cudf.NA - else result + if result[0] is cudf.NA + else result[0] ) return result - def median(self, skipna=None): + def median(self, skipna: bool = None) -> NumericalColumn: skipna = True if skipna is None else skipna if not skipna and self.has_nulls: @@ -301,24 +327,17 @@ def median(self, skipna=None): # enforce linear in case the default ever changes return self.quantile(0.5, interpolation="linear", exact=True) - def _numeric_quantile(self, q, interpolation, exact): - is_number = isinstance(q, Number) - - if is_number: - quant = [float(q)] - elif isinstance(q, list) or isinstance(q, np.ndarray): - quant = q - else: - msg = "`q` must be either a single element, list or numpy array" - raise TypeError(msg) - + def _numeric_quantile( + 
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> NumericalColumn: + quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls sorted_indices = self.as_frame()._get_sorted_inds(True, "first") sorted_indices = sorted_indices[self.null_count :] return cpp_quantile(self, quant, interpolation, sorted_indices, exact) - def cov(self, other): + def cov(self, other: ColumnBase) -> float: if ( len(self) == 0 or len(other) == 0 @@ -330,7 +349,7 @@ def cov(self, other): cov_sample = result.sum() / (len(self) - 1) return cov_sample - def corr(self, other): + def corr(self, other: ColumnBase) -> float: if len(self) == 0 or len(other) == 0: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -341,12 +360,14 @@ def corr(self, other): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) return cov / lhs_std / rhs_std - def round(self, decimals=0): + def round(self, decimals: int = 0) -> NumericalColumn: """Round the values in the Column to the given number of decimals. """ return libcudf.round.round(self, decimal_places=decimals) - def applymap(self, udf, out_dtype=None): + def applymap( + self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None + ) -> ColumnBase: """Apply an element-wise function to transform the values in the Column. Parameters @@ -367,7 +388,7 @@ def applymap(self, udf, out_dtype=None): out = column.column_applymap(udf=udf, column=self, out_dtype=out_dtype) return out - def default_na_value(self): + def default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ dkind = self.dtype.kind @@ -382,7 +403,12 @@ def default_na_value(self): else: raise TypeError(f"numeric column of {self.dtype} has no NaN value") - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> NumericalColumn: """ Return col with *to_replace* replaced with *replacement*.
""" @@ -409,38 +435,52 @@ def find_and_replace(self, to_replace, replacement, all_nan): replaced, to_replace_col, replacement_col ) - def fillna(self, fill_value=None, method=None): + def fillna( + self, + fill_value: Any = None, + method: str = None, + dtype: Dtype = None, + fill_nan: bool = True, + ) -> NumericalColumn: """ Fill null values with *fill_value* """ + if fill_nan: + col = self.nans_to_nulls() + else: + col = self + if method is not None: - return super().fillna(fill_value, method) + return super(NumericalColumn, col).fillna(fill_value, method) if ( isinstance(fill_value, cudf.Scalar) - and fill_value.dtype == self.dtype + and fill_value.dtype == col.dtype ): - return super().fillna(fill_value, method) + return super(NumericalColumn, col).fillna(fill_value, method) + if np.isscalar(fill_value): # castsafely to the same dtype as self - fill_value_casted = self.dtype.type(fill_value) + fill_value_casted = col.dtype.type(fill_value) if not np.isnan(fill_value) and (fill_value_casted != fill_value): raise TypeError( f"Cannot safely cast non-equivalent " - f"{type(fill_value).__name__} to {self.dtype.name}" + f"{type(fill_value).__name__} to {col.dtype.name}" ) fill_value = cudf.Scalar(fill_value_casted) else: fill_value = column.as_column(fill_value, nan_as_null=False) # cast safely to the same dtype as self - if is_integer_dtype(self.dtype): - fill_value = _safe_cast_to_int(fill_value, self.dtype) + if is_integer_dtype(col.dtype): + fill_value = _safe_cast_to_int(fill_value, col.dtype) else: - fill_value = fill_value.astype(self.dtype) + fill_value = fill_value.astype(col.dtype) - return super().fillna(fill_value, method) + return super(NumericalColumn, col).fillna(fill_value, method) - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: """ Returns offset of first value that matches. For monotonic columns, returns the offset of the first larger value @@ -469,7 +509,7 @@ def find_first_value(self, value, closest=False): raise ValueError("value not found") return found - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches. 
For monotonic columns, returns the offset of the last smaller value @@ -498,7 +538,7 @@ def find_last_value(self, value, closest=False): raise ValueError("value not found") return found - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ Returns true if all the values in self can be safely cast to dtype """ @@ -585,10 +625,10 @@ def can_cast_safely(self, to_dtype): elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}: info = np.iinfo(to_dtype) min_, max_ = info.min, info.max + # the cast is only safe if the values are within the target + # integer range and are all whole numbers; check that here if (self.min() >= min_) and (self.max() <= max_): - - filled = self.fillna(0) + filled = self.fillna(0, fill_nan=False) if (cudf.Series(filled) % 1 == 0).all(): return True else: @@ -596,9 +636,17 @@ def can_cast_safely(self, to_dtype): else: return False + return False + @annotate("BINARY_OP", color="orange", domain="cudf_python") -def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): +def _numeric_column_binop( + lhs: Union[ColumnBase, ScalarLike], + rhs: Union[ColumnBase, ScalarLike], + op: str, + out_dtype: Dtype, + reflect: bool = False, +) -> ColumnBase: if reflect: lhs, rhs = rhs, lhs @@ -615,7 +663,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False): return out -def _numeric_column_unaryop(operand, op): +def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase: if callable(op): return libcudf.transform.transform(operand, op) @@ -623,7 +671,7 @@ def _numeric_column_unaryop(operand, op): return libcudf.unary.unary_operation(operand, op) -def _safe_cast_to_int(col, dtype): +def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase: """ Cast given NumericalColumn to given integer dtype safely. """ @@ -642,7 +690,9 @@ def _safe_cast_to_int(col, dtype): ) -def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): +def _normalize_find_and_replace_input( + input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] +) -> ColumnBase: normalized_column = column.as_column( col_to_normalize, dtype=input_column_dtype if len(col_to_normalize) <= 0 else None, @@ -684,7 +734,9 @@ def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize): return normalized_column.astype(input_column_dtype) -def digitize(column, bins, right=False): +def digitize( + column: ColumnBase, bins: np.ndarray, right: bool = False +) -> ColumnBase: """Return the indices of the bins to which each value in column belongs. Parameters @@ -699,7 +751,7 @@ def digitize(column, bins, right=False): Returns ------- - A device array containing the indices + A column containing the indices """ if not column.dtype == bins.dtype: raise ValueError( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f5df440b865..0124b421266 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,10 +1,15 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION.
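The float-to-integer branch of NumericalColumn.can_cast_safely above reduces to a range check plus a wholeness check; roughly, as a plain NumPy sketch:

import numpy as np

vals = np.array([1.0, 2.0, 3.5])
info = np.iinfo(np.int32)
in_range = vals.min() >= info.min and vals.max() <= info.max
whole = bool((vals % 1 == 0).all())
print(in_range and whole)  # False: 3.5 is not a whole number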
+from __future__ import annotations + +import builtins import pickle import warnings +from typing import Any, Dict, Optional, Sequence, Tuple, Union, cast, overload import cupy import numpy as np import pandas as pd +from numba import cuda from nvtx import annotate import cudf @@ -140,6 +145,7 @@ translate as cpp_translate, ) from cudf._lib.strings.wrap import wrap as cpp_wrap +from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.methods import ColumnMethodsMixin @@ -197,6 +203,9 @@ } +ParentType = Union["cudf.Series", "cudf.Index"] + + class StringMethods(ColumnMethodsMixin): def __init__(self, column, parent=None): """ @@ -214,10 +223,9 @@ def __init__(self, column, parent=None): raise AttributeError( "Can only use .str accessor with string values" ) - self._column = column - self._parent = parent + super().__init__(column=column, parent=parent) - def htoi(self): + def htoi(self) -> ParentType: """ Returns integer value represented by each hex string. String is interpreted to have hex (base-16) characters. @@ -242,7 +250,7 @@ def htoi(self): return self._return_or_inplace(out, inplace=False) - def ip2int(self): + def ip2int(self) -> ParentType: """ This converts IP strings to integers @@ -279,7 +287,7 @@ def __getitem__(self, key): else: return self.get(key) - def len(self): + def len(self) -> ParentType: """ Computes the length of each element in the Series/Index. @@ -301,7 +309,7 @@ def len(self): return self._return_or_inplace(cpp_count_characters(self._column)) - def byte_count(self): + def byte_count(self) -> ParentType: """ Computes the number of bytes of each string in the Series/Index. @@ -328,6 +336,16 @@ def byte_count(self): """ return self._return_or_inplace(cpp_count_bytes(self._column),) + @overload + def cat(self, sep: str = None, na_rep: str = None) -> str: + ... + + @overload + def cat( + self, others, sep: str = None, na_rep: str = None + ) -> Union[ParentType, "cudf.core.column.StringColumn"]: + ... + def cat(self, others=None, sep=None, na_rep=None): """ Concatenate strings in the Series/Index with given separator. @@ -339,28 +357,28 @@ def cat(self, others=None, sep=None, na_rep=None): Parameters ---------- - others : Series or List of str - Strings to be appended. - The number of strings must match ``size()`` of this instance. - This must be either a Series of string dtype or a Python - list of strings. + others : Series or List of str + Strings to be appended. + The number of strings must match ``size()`` of this instance. + This must be either a Series of string dtype or a Python + list of strings. - sep : str - If specified, this separator will be appended to each string - before appending the others. + sep : str + If specified, this separator will be appended to each string + before appending the others. - na_rep : str - This character will take the place of any null strings - (not empty strings) in either list. + na_rep : str + This character will take the place of any null strings + (not empty strings) in either list. - - If ``na_rep`` is ``None``, and ``others`` is ``None``, - missing values in the Series/Index are - omitted from the result. + - If ``na_rep`` is ``None``, and ``others`` is ``None``, + missing values in the Series/Index are + omitted from the result. - - If ``na_rep`` is ``None``, and ``others`` is - not ``None``, a row containing a missing value - in any of the columns (before concatenation) - will have a missing value in the result.
+ - If ``na_rep`` is ``None``, and ``others`` is + not ``None``, a row containing a missing value + in any of the columns (before concatenation) + will have a missing value in the result. Returns ------- @@ -441,7 +459,7 @@ def cat(self, others=None, sep=None, na_rep=None): out = out[0] return out - def join(self, sep): + def join(self, sep) -> ParentType: """ Join lists contained as elements in the Series/Index with passed delimiter. @@ -453,7 +471,9 @@ def join(self, sep): "Columns of arrays / lists are not yet " "supported" ) - def extract(self, pat, flags=0, expand=True): + def extract( + self, pat: str, flags: int = 0, expand: bool = True + ) -> ParentType: """ Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -517,7 +537,14 @@ def extract(self, pat, flags=0, expand=True): else: return self._return_or_inplace(out, expand=expand) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + def contains( + self, + pat: Union[str, Sequence], + case: bool = True, + flags: int = 0, + na=np.nan, + regex: bool = True, + ) -> ParentType: """ Test if pattern or regex is contained within a string of a Series or Index. @@ -646,7 +673,15 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): ) return self._return_or_inplace(result_col) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + def replace( + self, + pat: Union[str, Sequence], + repl: Union[str, Sequence], + n: int = -1, + case=None, + flags: int = 0, + regex: bool = True, + ) -> ParentType: """ Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to `str.replace() @@ -748,7 +783,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): ), ) - def replace_with_backrefs(self, pat, repl): + def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: """ Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. @@ -778,7 +813,9 @@ def replace_with_backrefs(self, pat, repl): cpp_replace_with_backrefs(self._column, pat, repl) ) - def slice(self, start=None, stop=None, step=None): + def slice( + self, start: int = None, stop: int = None, step: int = None + ) -> ParentType: """ Slice substrings from each element in the Series or Index. @@ -847,7 +884,7 @@ def slice(self, start=None, stop=None, step=None): cpp_slice_strings(self._column, start, stop, step), ) - def isinteger(self): + def isinteger(self) -> ParentType: """ Check whether all characters in each string form an integer. @@ -907,7 +944,7 @@ def isinteger(self): """ return self._return_or_inplace(cpp_is_integer(self._column)) - def ishex(self): + def ishex(self) -> ParentType: """ Check whether all characters in each string form a hex integer. @@ -946,7 +983,7 @@ def ishex(self): """ return self._return_or_inplace(str_cast.is_hex(self._column)) - def istimestamp(self, format): + def istimestamp(self, format: str) -> ParentType: """ Check whether all characters in each string can be converted to a timestamp using the given format. @@ -970,7 +1007,7 @@ def istimestamp(self, format): str_cast.istimestamp(self._column, format) ) - def isfloat(self): + def isfloat(self) -> ParentType: """ Check whether all characters in each string form a floating value. @@ -1033,7 +1070,7 @@ def isfloat(self): """ return self._return_or_inplace(cpp_is_float(self._column)) - def isdecimal(self): + def isdecimal(self) -> ParentType: """ Check whether all characters in each string are decimal.
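As a usage illustration of the contains signature typed above (expected outputs shown as comments; nulls propagate):

import cudf

s = cudf.Series(["Mouse", "dog", "house and parrot", "23", None])
print(s.str.contains("house|dog", regex=True))
# False, True, True, False, <NA>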
@@ -1094,7 +1131,7 @@ def isdecimal(self): """ return self._return_or_inplace(cpp_is_decimal(self._column)) - def isalnum(self): + def isalnum(self) -> ParentType: """ Check whether all characters in each string are alphanumeric. @@ -1163,7 +1200,7 @@ def isalnum(self): """ return self._return_or_inplace(cpp_is_alnum(self._column)) - def isalpha(self): + def isalpha(self) -> ParentType: """ Check whether all characters in each string are alphabetic. @@ -1219,7 +1256,7 @@ def isalpha(self): """ return self._return_or_inplace(cpp_is_alpha(self._column)) - def isdigit(self): + def isdigit(self) -> ParentType: """ Check whether all characters in each string are digits. @@ -1281,7 +1318,7 @@ def isdigit(self): """ return self._return_or_inplace(cpp_is_digit(self._column)) - def isnumeric(self): + def isnumeric(self) -> ParentType: """ Check whether all characters in each string are numeric. @@ -1349,7 +1386,7 @@ def isnumeric(self): """ return self._return_or_inplace(cpp_is_numeric(self._column)) - def isupper(self): + def isupper(self) -> ParentType: """ Check whether all characters in each string are uppercase. @@ -1406,7 +1443,7 @@ def isupper(self): """ return self._return_or_inplace(cpp_is_upper(self._column)) - def islower(self): + def islower(self) -> ParentType: """ Check whether all characters in each string are lowercase. @@ -1463,7 +1500,7 @@ def islower(self): """ return self._return_or_inplace(cpp_is_lower(self._column)) - def isipv4(self): + def isipv4(self) -> ParentType: """ Check whether all characters in each string form an IPv4 address. @@ -1487,7 +1524,7 @@ def isipv4(self): """ return self._return_or_inplace(str_cast.is_ipv4(self._column)) - def lower(self): + def lower(self) -> ParentType: """ Converts all characters to lowercase. @@ -1526,7 +1563,7 @@ def lower(self): """ return self._return_or_inplace(cpp_to_lower(self._column)) - def upper(self): + def upper(self) -> ParentType: """ Convert each string to uppercase. This only applies to ASCII characters at this time. @@ -1575,7 +1612,7 @@ def upper(self): """ return self._return_or_inplace(cpp_to_upper(self._column)) - def capitalize(self): + def capitalize(self) -> ParentType: """ Convert strings in the Series/Index to be capitalized. This only applies to ASCII characters at this time. @@ -1603,7 +1640,7 @@ def capitalize(self): """ return self._return_or_inplace(cpp_capitalize(self._column)) - def swapcase(self): + def swapcase(self) -> ParentType: """ Change each lowercase character to uppercase and vice versa. This only applies to ASCII characters at this time. @@ -1648,7 +1685,7 @@ def swapcase(self): """ return self._return_or_inplace(cpp_swapcase(self._column)) - def title(self): + def title(self) -> ParentType: """ Uppercase the first letter of each word (each letter that follows a space) and lowercase the rest. @@ -1693,7 +1730,7 @@ def title(self): """ return self._return_or_inplace(cpp_title(self._column)) - def filter_alphanum(self, repl=None, keep=True): + def filter_alphanum( + self, repl: str = None, keep: bool = True + ) -> ParentType: """ Remove non-alphanumeric characters from strings in this column. @@ -1728,7 +1767,9 @@ def filter_alphanum(self, repl=None, keep=True): cpp_filter_alphanum(self._column, cudf.Scalar(repl), keep), ) - def slice_from(self, starts, stops): + def slice_from( + self, starts: "cudf.Series", stops: "cudf.Series" + ) -> ParentType: """ Return substring of each string using positions for each string.
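A quick illustration of the title/capitalize distinction documented above (hypothetical values, expected outputs as comments):

import cudf

s = cudf.Series(["hello world", "gpu STRINGS"])
print(s.str.title())       # 'Hello World', 'Gpu Strings'
print(s.str.capitalize())  # 'Hello world', 'Gpu strings'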
@@ -1771,7 +1812,9 @@ def slice_from(self, starts, stops): ), ) - def slice_replace(self, start=None, stop=None, repl=None): + def slice_replace( + self, start: int = None, stop: int = None, repl: str = None + ) -> ParentType: """ Replace the specified section of each string with a new string. @@ -1856,7 +1899,7 @@ def slice_replace(self, start=None, stop=None, repl=None): cpp_slice_replace(self._column, start, stop, cudf.Scalar(repl)), ) - def insert(self, start=0, repl=None): + def insert(self, start: int = 0, repl: str = None) -> ParentType: """ Insert the specified string into each string in the specified position. @@ -1906,7 +1949,7 @@ def insert(self, start=0, repl=None): cpp_string_insert(self._column, start, cudf.Scalar(repl)), ) - def get(self, i=0): + def get(self, i: int = 0) -> ParentType: """ Extract element from each component at specified position. @@ -1950,7 +1993,9 @@ def get(self, i=0): return self._return_or_inplace(cpp_string_get(self._column, i)) - def split(self, pat=None, n=-1, expand=None): + def split( + self, pat: str = None, n: int = -1, expand: bool = None + ) -> ParentType: """ Split strings around given separator/delimiter. @@ -2079,14 +2124,14 @@ def split(self, pat=None, n=-1, expand=None): if expand: if self._column.null_count == len(self._column): - result_table = [self._column.copy()] + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = cpp_split( self._column, cudf.Scalar(pat, "str"), n ) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._parent): - result_table = [] + if result_table._data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) else: result_table = cpp_split_record( self._column, cudf.Scalar(pat, "str"), n @@ -2094,7 +2139,9 @@ def split(self, pat=None, n=-1, expand=None): return self._return_or_inplace(result_table, expand=expand) - def rsplit(self, pat=None, n=-1, expand=None): + def rsplit( + self, pat: str = None, n: int = -1, expand: bool = None + ) -> ParentType: """ Split strings around given separator/delimiter. @@ -2232,18 +2279,18 @@ def rsplit(self, pat=None, n=-1, expand=None): if expand: if self._column.null_count == len(self._column): - result_table = [self._column.copy()] + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = cpp_rsplit(self._column, cudf.Scalar(pat), n) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._parent): - result_table = [] + if result_table._data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) else: result_table = cpp_rsplit_record(self._column, cudf.Scalar(pat), n) return self._return_or_inplace(result_table, expand=expand) - def partition(self, sep=" ", expand=True): + def partition(self, sep: str = " ", expand: bool = True) -> ParentType: """ Split the string at the first occurrence of sep. @@ -2323,7 +2370,7 @@ def partition(self, sep=" ", expand=True): cpp_partition(self._column, cudf.Scalar(sep)), expand=expand ) - def rpartition(self, sep=" ", expand=True): + def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: """ Split the string at the last occurrence of sep. @@ -2387,7 +2434,9 @@ def rpartition(self, sep=" ", expand=True): cpp_rpartition(self._column, cudf.Scalar(sep)), expand=expand ) - def pad(self, width, side="left", fillchar=" "): + def pad( + self, width: int, side: str = "left", fillchar: str = " " + ) -> ParentType: """ Pad strings in the Series/Index up to width. 
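The split/rsplit hunks above now wrap expanded results in a cudf Frame; at the API level, expand=True still yields a DataFrame while the default returns one list per row. A sketch, with expected outputs as comments:

import cudf

s = cudf.Series(["a b", "c d e"])
print(s.str.split(" ", expand=True))  # DataFrame with columns 0, 1, 2; short rows null-padded
print(s.str.split(" "))               # [['a', 'b'], ['c', 'd', 'e']]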
@@ -2472,7 +2521,7 @@ def pad(self, width, side="left", fillchar=" "): cpp_pad(self._column, width, fillchar, side) ) - def zfill(self, width): + def zfill(self, width: int) -> ParentType: """ Pad strings in the Series/Index by prepending ‘0’ characters. @@ -2545,7 +2594,7 @@ def zfill(self, width): return self._return_or_inplace(cpp_zfill(self._column, width)) - def center(self, width, fillchar=" "): + def center(self, width: int, fillchar: str = " ") -> ParentType: """ Filling left and right side of strings in the Series/Index with an additional character. @@ -2617,7 +2666,7 @@ def center(self, width, fillchar=" "): cpp_center(self._column, width, fillchar) ) - def ljust(self, width, fillchar=" "): + def ljust(self, width: int, fillchar: str = " ") -> ParentType: """ Filling right side of strings in the Series/Index with an additional character. Equivalent to `str.ljust() @@ -2671,7 +2720,7 @@ def ljust(self, width, fillchar=" "): cpp_ljust(self._column, width, fillchar) ) - def rjust(self, width, fillchar=" "): + def rjust(self, width: int, fillchar: str = " ") -> ParentType: """ Filling left side of strings in the Series/Index with an additional character. Equivalent to `str.rjust() @@ -2725,7 +2774,7 @@ def rjust(self, width, fillchar=" "): cpp_rjust(self._column, width, fillchar) ) - def strip(self, to_strip=None): + def strip(self, to_strip: str = None) -> ParentType: """ Remove leading and trailing characters. @@ -2784,7 +2833,7 @@ def strip(self, to_strip=None): cpp_strip(self._column, cudf.Scalar(to_strip)) ) - def lstrip(self, to_strip=None): + def lstrip(self, to_strip: str = None) -> ParentType: """ Remove leading characters. @@ -2831,7 +2880,7 @@ def lstrip(self, to_strip=None): cpp_lstrip(self._column, cudf.Scalar(to_strip)) ) - def rstrip(self, to_strip=None): + def rstrip(self, to_strip: str = None) -> ParentType: """ Remove trailing characters. @@ -2886,7 +2935,7 @@ def rstrip(self, to_strip=None): cpp_rstrip(self._column, cudf.Scalar(to_strip)) ) - def wrap(self, width, **kwargs): + def wrap(self, width: int, **kwargs) -> ParentType: """ Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. @@ -2980,7 +3029,7 @@ def wrap(self, width, **kwargs): return self._return_or_inplace(cpp_wrap(self._column, width)) - def count(self, pat, flags=0): + def count(self, pat: str, flags: int = 0) -> ParentType: """ Count occurrences of pattern in each string of the Series/Index. @@ -3040,7 +3089,9 @@ def count(self, pat, flags=0): return self._return_or_inplace(cpp_count_re(self._column, pat)) - def findall(self, pat, flags=0, expand=True): + def findall( + self, pat: str, flags: int = 0, expand: bool = True + ) -> ParentType: """ Find all occurrences of pattern or regular expression in the Series/Index. @@ -3108,7 +3159,7 @@ def findall(self, pat, flags=0, expand=True): cpp_findall(self._column, pat), expand=expand ) - def isempty(self): + def isempty(self) -> ParentType: """ Check whether each string is an empty string. @@ -3128,9 +3179,9 @@ def isempty(self): 4 False dtype: bool """ - return self._return_or_inplace((self._parent == "").fillna(False)) + return self._return_or_inplace((self._column == "").fillna(False)) - def isspace(self): + def isspace(self) -> ParentType: """ Check whether all characters in each string are whitespace.
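The padding helpers typed above are related: pad(side=...) generalizes ljust/rjust/center. For example (expected outputs as comments):

import cudf

s = cudf.Series(["dog", "bird"])
print(s.str.ljust(6, fillchar="."))             # 'dog...', 'bird..'
print(s.str.rjust(6, fillchar="."))             # '...dog', '..bird'
print(s.str.pad(6, side="left", fillchar="."))  # same result as rjust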
@@ -3186,7 +3237,7 @@ """ return self._return_or_inplace(cpp_isspace(self._column)) - def endswith(self, pat): + def endswith(self, pat: str) -> ParentType: """ Test if the end of each string element matches a pattern. @@ -3240,7 +3291,7 @@ def endswith(self, pat): return self._return_or_inplace(result_col) - def startswith(self, pat): + def startswith(self, pat: Union[str, Sequence]) -> ParentType: """ Test if the start of each string element matches a pattern. @@ -3300,7 +3351,7 @@ def startswith(self, pat): return self._return_or_inplace(result_col) - def find(self, sub, start=0, end=None): + def find(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return lowest indexes in each string in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3355,7 +3406,7 @@ def find(self, sub, start=0, end=None): return self._return_or_inplace(result_col) - def rfind(self, sub, start=0, end=None): + def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return highest indexes in each string in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3414,7 +3465,7 @@ def rfind(self, sub, start=0, end=None): return self._return_or_inplace(result_col) - def index(self, sub, start=0, end=None): + def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return lowest indexes in each string where the substring is fully contained between ``[start:end]``. This is the same @@ -3474,7 +3525,7 @@ def index(self, sub, start=0, end=None): else: return result - def rindex(self, sub, start=0, end=None): + def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: """ Return highest indexes in each string where the substring is fully contained between ``[start:end]``. This is the same @@ -3534,7 +3585,7 @@ def rindex(self, sub, start=0, end=None): else: return result - def match(self, pat, case=True, flags=0): + def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: """ Determine if each string matches a regular expression. @@ -3579,7 +3630,7 @@ def match(self, pat, case=True, flags=0): return self._return_or_inplace(cpp_match_re(self._column, pat)) - def url_decode(self): + def url_decode(self) -> ParentType: """ Returns a URL-decoded format of each string. No format checking is performed. All characters @@ -3609,7 +3660,7 @@ def url_decode(self): return self._return_or_inplace(cpp_url_decode(self._column)) - def url_encode(self): + def url_encode(self) -> ParentType: """ Returns a URL-encoded format of each string. No format checking is performed. @@ -3640,7 +3691,7 @@ def url_encode(self): """ return self._return_or_inplace(cpp_url_encode(self._column)) - def code_points(self): + def code_points(self) -> ParentType: """ Returns an array by filling it with the UTF-8 code point values for each character of each string. @@ -3673,14 +3724,14 @@ def code_points(self): """ new_col = cpp_code_points(self._column) - if self._parent is None: - return new_col - elif isinstance(self._parent, cudf.Series): + if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): return cudf.core.index.as_index(new_col, name=self._parent.name) + else: + return new_col - def translate(self, table): + def translate(self, table: dict) -> ParentType: """ Map all characters in the string through the given mapping table.
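find/rfind/index/rindex above differ mainly in how a miss is reported: find returns -1 while index raises. A short sketch, expected outputs as comments:

import cudf

s = cudf.Series(["hello", "world"])
print(s.str.find("o"))  # 4, 1
print(s.str.find("z"))  # -1, -1
# s.str.index("z") raises ValueError instead of returning -1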
@@ -3723,7 +3774,9 @@ def translate(self, table): table = str.maketrans(table) return self._return_or_inplace(cpp_translate(self._column, table)) - def filter_characters(self, table, keep=True, repl=None): + def filter_characters( + self, table: dict, keep: bool = True, repl: str = None + ) -> ParentType: """ Remove characters from each string using the character ranges in the given mapping table. @@ -3774,7 +3827,7 @@ def filter_characters(self, table, keep=True, repl=None): ), ) - def normalize_spaces(self): + def normalize_spaces(self) -> ParentType: """ Remove extra whitespace between tokens and trim whitespace from the beginning and the end of each string. @@ -3794,7 +3847,7 @@ def normalize_spaces(self): """ return self._return_or_inplace(cpp_normalize_spaces(self._column)) - def normalize_characters(self, do_lower=True): + def normalize_characters(self, do_lower: bool = True) -> ParentType: """ Normalizes string characters for tokenizing. @@ -3843,7 +3896,7 @@ def normalize_characters(self, do_lower=True): cpp_normalize_characters(self._column, do_lower) ) - def tokenize(self, delimiter=" "): + def tokenize(self, delimiter: str = " ") -> ParentType: """ Each string is split into tokens using the provided delimiter(s). The sequence returned contains the tokens in the order @@ -3890,7 +3943,9 @@ def tokenize(self, delimiter=" "): for delimiters, but got {type(delimiter)}" ) - def detokenize(self, indices, separator=" "): + def detokenize( + self, indices: "cudf.Series", separator: str = " " + ) -> ParentType: """ Combines tokens into strings by concatenating them in the order in which they appear in the ``indices`` column. The ``separator`` is @@ -3898,7 +3953,7 @@ def detokenize(self, indices, separator=" "): Parameters ---------- - indices : list of ints + indices : Series Each value identifies the output row for the corresponding token. separator : str The string concatenated between each token in an output row. @@ -3925,7 +3980,7 @@ def detokenize(self, indices, separator=" "): retain_index=False, ) - def character_tokenize(self): + def character_tokenize(self) -> ParentType: """ Each string is split into individual characters. The sequence returned contains each character as an individual string. @@ -3973,14 +4028,14 @@ def character_tokenize(self): dtype: object """ result_col = cpp_character_tokenize(self._column) - if self._parent is None: - return result_col - elif isinstance(self._parent, cudf.Series): + if isinstance(self._parent, cudf.Series): return cudf.Series(result_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): return cudf.core.index.as_index(result_col, name=self._parent.name) + else: + return result_col - def token_count(self, delimiter=" "): + def token_count(self, delimiter: str = " ") -> ParentType: """ Each string is split into tokens using the provided delimiter. The returned integer sequence is the number of tokens in each string. @@ -4022,7 +4077,7 @@ def token_count(self, delimiter=" "): for delimiters, but got {type(delimiter)}" ) - def ngrams(self, n=2, separator="_"): + def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: """ Generate the n-grams from a set of tokens; each record in the series is treated as a token. @@ -4059,7 +4114,7 @@ def ngrams(self, n=2, separator="_"): cpp_generate_ngrams(self._column, n, separator), retain_index=False ) - def character_ngrams(self, n=2): + def character_ngrams(self, n: int = 2) -> ParentType: """ Generate the n-grams from characters in a column of strings.
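Unlike split, the tokenizer helpers above flatten their results: tokenize returns every token in a single Series while token_count stays row-aligned. A sketch, expected outputs as comments:

import cudf

s = cudf.Series(["the quick fox", "jumped"])
print(s.str.token_count())  # 3, 1
print(s.str.tokenize())     # 'the', 'quick', 'fox', 'jumped'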
@@ -4095,7 +4150,9 @@ def character_ngrams(self, n=2): cpp_generate_character_ngrams(self._column, n), retain_index=False ) - def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): + def ngrams_tokenize( + self, n: int = 2, delimiter: str = " ", separator: str = "_" + ) -> ParentType: """ Generate the n-grams using tokens from each string. This will tokenize each string and then generate ngrams for each @@ -4131,7 +4188,9 @@ def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"): retain_index=False, ) - def replace_tokens(self, targets, replacements, delimiter=None): + def replace_tokens( + self, targets, replacements, delimiter: str = None + ) -> ParentType: """ The target tokens are searched for within each string in the series and replaced with the corresponding replacements if found. @@ -4213,8 +4272,11 @@ def replace_tokens(self, targets, replacements, delimiter=None): ) def filter_tokens( - self, min_token_length, replacement=None, delimiter=None - ): + self, + min_token_length: int, + replacement: str = None, + delimiter: str = None, + ) -> ParentType: """ Remove tokens from within each string in the series that are smaller than min_token_length and optionally replace them @@ -4282,13 +4344,13 @@ def filter_tokens( def subword_tokenize( self, - hash_file, - max_length=64, - stride=48, - do_lower=True, - do_truncate=False, - max_rows_tensor=500, - ): + hash_file: str, + max_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + max_rows_tensor: int = 500, + ) -> Tuple[cupy.ndarray, cupy.ndarray, cupy.ndarray]: """ Run CUDA BERT subword tokenizer on cuDF strings column. Encodes words to token ids using vocabulary from a pretrained @@ -4337,12 +4399,12 @@ def subword_tokenize( Returns ------- - token-ids : Column + token-ids : cupy.ndarray The token-ids for each string padded with 0s to max_length. - attention-mask : Column + attention-mask : cupy.ndarray The mask for token-ids result where corresponding positions identify valid token-id values. - metadata : Column + metadata : cupy.ndarray Each row contains the index id of the original string and the first and last index of the token-ids that are non-padded and non-overlapping. @@ -4383,7 +4445,7 @@ def subword_tokenize( cupy.asarray(metadata), ) - def porter_stemmer_measure(self): + def porter_stemmer_measure(self) -> ParentType: """ Compute the Porter Stemmer measure for each string. The Porter Stemmer algorithm is described `here @@ -4406,7 +4468,7 @@ def porter_stemmer_measure(self): cpp_porter_stemmer_measure(self._column) ) - def is_consonant(self, position): + def is_consonant(self, position) -> ParentType: """ Return true for strings where the character at ``position`` is a consonant. The ``position`` parameter may also be a list of integers @@ -4450,7 +4512,7 @@ def is_consonant(self, position): cpp_is_letter(self._column, ltype, position) ) - def is_vowel(self, position): + def is_vowel(self, position) -> ParentType: """ Return true for strings where the character at ``position`` is a vowel -- not a consonant. The ``position`` parameter may also be @@ -4494,7 +4556,7 @@ def is_vowel(self, position): cpp_is_letter(self._column, ltype, position) ) - def edit_distance(self, targets): + def edit_distance(self, targets) -> ParentType: """ The ``targets`` strings are measured against the strings in this instance using the Levenshtein edit distance algorithm.
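subword_tokenize above is now annotated to return three cupy.ndarrays; a typical caller reshapes the flat buffers before feeding a BERT-style model. A sketch only: 'vocab_hash.txt' is a hypothetical pretrained vocabulary hash file.

import cudf

s = cudf.Series(["hello world"])
tokens, masks, metadata = s.str.subword_tokenize(
    "vocab_hash.txt", max_length=8, stride=8
)
# flat device buffers; reshape to (num_rows, max_length)
token_ids = tokens.reshape(-1, 8)
attention = masks.reshape(-1, 8)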
@@ -4576,8 +4638,17 @@ class StringColumn(column.ColumnBase): """Implements operations for Columns of String type """ + _start_offset: Optional[int] + _end_offset: Optional[int] + _cached_sizeof: Optional[int] + def __init__( - self, mask=None, size=None, offset=0, null_count=None, children=() + self, + mask: Buffer = None, + size: int = None, + offset: int = 0, + null_count: int = None, + children: Tuple["column.ColumnBase", ...] = (), ): """ Parameters @@ -4627,34 +4698,38 @@ def __init__( self._end_offset = None @property - def start_offset(self): + def start_offset(self) -> int: if self._start_offset is None: if ( len(self.base_children) == 2 and self.offset < self.base_children[0].size ): - self._start_offset = int(self.base_children[0][self.offset]) + self._start_offset = int( + self.base_children[0].element_indexing(self.offset) + ) else: self._start_offset = 0 return self._start_offset @property - def end_offset(self): + def end_offset(self) -> int: if self._end_offset is None: if ( len(self.base_children) == 2 and (self.offset + self.size) < self.base_children[0].size ): self._end_offset = int( - self.base_children[0][self.offset + self.size] + self.base_children[0].element_indexing( + self.offset + self.size + ) ) else: self._end_offset = 0 return self._end_offset - def __sizeof__(self): + def __sizeof__(self) -> int: if self._cached_sizeof is None: n = 0 if len(self.base_children) == 2: @@ -4676,7 +4751,7 @@ def __sizeof__(self): return self._cached_sizeof @property - def base_size(self): + def base_size(self) -> int: if len(self.base_children) == 0: return 0 else: @@ -4685,7 +4760,13 @@ def base_size(self): / self.base_children[0].dtype.itemsize ) - def sum(self, skipna=None, dtype=None, min_count=0): + @property + def data_array_view(self) -> cuda.devicearray.DeviceNDArray: + raise ValueError("Cannot get an array view of a StringColumn") + + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count ) @@ -4703,39 +4784,38 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_mask(self, value): + def set_base_mask(self, value: Optional[Buffer]): super().set_base_mask(value) - def set_base_children(self, value): + def set_base_children(self, value: Tuple["column.ColumnBase", ...]): # TODO: Implement dtype validation of the children here somehow super().set_base_children(value) - def __contains__(self, item): + def __contains__(self, item: ScalarLike) -> bool: return True in self.str().contains(f"^{item}$") - def str(self, parent=None): + def str(self, parent: ParentType = None) -> StringMethods: return StringMethods(self, parent=parent) - def unary_operator(self, unaryop): + def unary_operator(self, unaryop: builtins.str): raise TypeError( f"Series of dtype `str` cannot perform the operation: " f"{unaryop}" ) - def __len__(self): + def __len__(self) -> int: return self.size - def _set_mask(self, value): - super()._set_mask(value) - @property - def _nbytes(self): + def _nbytes(self) -> int: if self.size == 0: return 0 else: return self.children[1].size - def as_numerical_column(self, dtype): + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": out_dtype = np.dtype(dtype) if out_dtype.kind in {"i", "u"}: @@ -4775,42 +4855,49 @@ def _as_datetime_or_timedelta_column(self, dtype, format): return result_col - def as_datetime_column(self, dtype, format=None): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) 
-> "cudf.core.column.DatetimeColumn": out_dtype = np.dtype(dtype) + # infer on host from the first not na element + # or return all null column if all values + # are null in current column + format = kwargs.get("format", None) if format is None: - # infer on host from the first not na element - # or return all null column if all values - # are null in current column if self.null_count == len(self): - return column.column_empty( - len(self), dtype=out_dtype, masked=True + return cast( + "cudf.core.column.DatetimeColumn", + column.column_empty( + len(self), dtype=out_dtype, masked=True + ), ) else: - format = datetime.infer_format(self[self.notna()][0]) + format = datetime.infer_format( + self.apply_boolean_mask(self.notna()).element_indexing(0) + ) return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_timedelta_column(self, dtype, format=None): + def as_timedelta_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.TimeDeltaColumn": out_dtype = np.dtype(dtype) - - if format is None: - format = "%D days %H:%M:%S" - + format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) - def as_string_column(self, dtype): + def as_string_column(self, dtype: Dtype, format=None) -> StringColumn: return self @property - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the StringColumn. """ return self.to_pandas().values @property - def values(self): + def values(self) -> cupy.ndarray: """ Return a CuPy representation of the StringColumn. """ @@ -4818,7 +4905,7 @@ def values(self): "String Arrays is not yet implemented in cudf" ) - def to_array(self, fillna=None): + def to_array(self, fillna: bool = None) -> np.ndarray: """Get a dense numpy array for the data. 
Notes @@ -4851,8 +4938,8 @@ def __arrow_array__(self, type=None): "consider using .to_arrow()" ) - def serialize(self): - header = {"null_count": self.null_count} + def serialize(self) -> Tuple[dict, list]: + header = {"null_count": self.null_count} # type: Dict[Any, Any] header["type-serialized"] = pickle.dumps(type(self)) header["size"] = self.size @@ -4872,7 +4959,7 @@ def serialize(self): return header, frames @classmethod - def deserialize(cls, header, frames): + def deserialize(cls, header: dict, frames: list) -> StringColumn: size = header["size"] if not isinstance(size, int): size = pickle.loads(size) @@ -4880,26 +4967,28 @@ def deserialize(cls, header, frames): # Deserialize the mask, value, and offset frames buffers = [Buffer(each_frame) for each_frame in frames] + nbuf = None if header["null_count"] > 0: nbuf = buffers[2] - else: - nbuf = None children = [] for h, b in zip(header["subheaders"], buffers[:2]): column_type = pickle.loads(h["type-serialized"]) children.append(column_type.deserialize(h, [b])) - col = column.build_column( - data=None, - dtype="str", - mask=nbuf, - children=tuple(children), - size=size, + col = cast( + StringColumn, + column.build_column( + data=None, + dtype="str", + mask=nbuf, + children=tuple(children), + size=size, + ), ) return col - def can_cast_safely(self, to_dtype): + def can_cast_safely(self, to_dtype: Dtype) -> bool: to_dtype = np.dtype(to_dtype) if self.dtype == to_dtype: @@ -4911,7 +5000,12 @@ def can_cast_safely(self, to_dtype): else: return True - def find_and_replace(self, to_replace, replacement, all_nan): + def find_and_replace( + self, + to_replace: ColumnLike, + replacement: ColumnLike, + all_nan: bool = False, + ) -> StringColumn: """ Return col with *to_replace* replaced with *value* """ @@ -4919,7 +5013,12 @@ def find_and_replace(self, to_replace, replacement, all_nan): replacement = column.as_column(replacement, dtype=self.dtype) return libcudf.replace.replace(self, to_replace, replacement) - def fillna(self, fill_value=None, method=None): + def fillna( + self, + fill_value: Any = None, + method: builtins.str = None, + dtype: Dtype = None, + ) -> StringColumn: if fill_value is not None: if not is_scalar(fill_value): fill_value = column.as_column(fill_value, dtype=self.dtype) @@ -4927,24 +5026,26 @@ def fillna(self, fill_value=None, method=None): else: return super().fillna(method=method) - def _find_first_and_last(self, value): + def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: found_indices = self.str().contains(f"^{value}$") found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) return first, last - def find_first_value(self, value, closest=False): + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: return self._find_first_and_last(value)[0] - def find_last_value(self, value, closest=False): + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return self._find_first_and_last(value)[1] - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> "column.ColumnBase": # fastpath: gpu scalar if isinstance(other, cudf.Scalar) and other.dtype == "object": return column.as_column(other, length=len(self)) - if isinstance(other, column.Column): + if isinstance(other, column.ColumnBase): return other.astype(self.dtype) elif isinstance(other, str) or other is None: col = utils.scalar_broadcast_to( @@ 
-4959,16 +5060,18 @@ def normalize_binop_value(self, other): else: raise TypeError(f"cannot broadcast {type(other)}") - def default_na_value(self): + def default_na_value(self) -> ScalarLike: return None - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, op: builtins.str, rhs, reflect: bool = False + ) -> "column.ColumnBase": lhs = self if reflect: lhs, rhs = rhs, lhs if isinstance(rhs, (StringColumn, str, cudf.Scalar)): if op == "add": - return lhs.str().cat(others=rhs) + return cast("column.ColumnBase", lhs.str().cat(others=rhs)) elif op in ("eq", "ne", "gt", "lt", "ge", "le"): return _string_column_binop(self, rhs, op=op, out_dtype="bool") @@ -4977,7 +5080,7 @@ def binary_operator(self, op, rhs, reflect=False): ) @property - def is_unique(self): + def is_unique(self) -> bool: return len(self.unique()) == len(self) @property @@ -4986,19 +5089,17 @@ def __cuda_array_interface__(self): "Strings are not yet supported via `__cuda_array_interface__`" ) - def _mimic_inplace(self, other_col, inplace=False): - out = super()._mimic_inplace(other_col, inplace=inplace) - return out - @copy_docstring(column.ColumnBase.view) - def view(self, dtype): + def view(self, dtype) -> "cudf.core.column.ColumnBase": if self.null_count > 0: raise ValueError( "Can not produce a view of a string column with nulls" ) dtype = np.dtype(dtype) - str_byte_offset = self.base_children[0][self.offset] - str_end_byte_offset = self.base_children[0][self.offset + self.size] + str_byte_offset = self.base_children[0].element_indexing(self.offset) + str_end_byte_offset = self.base_children[0].element_indexing( + self.offset + self.size + ) char_dtype_size = self.base_children[1].dtype.itemsize n_bytes_to_view = ( @@ -5016,7 +5117,12 @@ def view(self, dtype): @annotate("BINARY_OP", color="orange", domain="cudf_python") -def _string_column_binop(lhs, rhs, op, out_dtype): +def _string_column_binop( + lhs: "column.ColumnBase", + rhs: "column.ColumnBase", + op: str, + out_dtype: Dtype, +) -> "column.ColumnBase": out = libcudf.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype=out_dtype) return out diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 9036f1e2962..f797bdf9635 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,6 +1,9 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
+from __future__ import annotations + import datetime as dt from numbers import Number +from typing import Any, Sequence, Tuple, Union, cast import numpy as np import pandas as pd @@ -9,6 +12,14 @@ import cudf from cudf import _lib as libcudf +from cudf._typing import ( + BinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ScalarLike, +) +from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string from cudf.core.column.datetime import _numpy_to_pandas_conversion from cudf.utils.dtypes import is_scalar, np_to_pa_dtype @@ -24,7 +35,13 @@ class TimeDeltaColumn(column.ColumnBase): def __init__( - self, data, dtype, size, mask=None, offset=0, null_count=None + self, + data: Buffer, + dtype: Dtype, + size: int = None, + mask: Buffer = None, + offset: int = 0, + null_count: int = None, ): """ Parameters @@ -46,7 +63,9 @@ def __init__( dtype = np.dtype(dtype) if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") - + if size is None: + size = data.size // dtype.itemsize + size = size - offset super().__init__( data, size=size, @@ -61,7 +80,7 @@ def __init__( self._time_unit, _ = np.datetime_data(self.dtype) - def __contains__(self, item): + def __contains__(self, item: DatetimeLikeScalar) -> bool: try: item = np.timedelta64(item, self._time_unit) except ValueError: @@ -71,7 +90,7 @@ def __contains__(self, item): return False return item.view("int64") in self.as_numerical - def to_arrow(self): + def to_arrow(self) -> pa.Array: mask = None if self.nullable: mask = pa.py_buffer(self.mask_array_view.copy_to_host()) @@ -84,7 +103,9 @@ def to_arrow(self): null_count=self.null_count, ) - def to_pandas(self, index=None, **kwargs): + def to_pandas( + self, index=None, nullable: bool = False, **kwargs + ) -> pd.Series: # Workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 @@ -98,8 +119,10 @@ def to_pandas(self, index=None, **kwargs): return pd_series - def _binary_op_floordiv(self, rhs): - lhs, rhs = self, rhs + def _binary_op_floordiv( + self, rhs: BinaryOperand + ) -> Tuple["column.ColumnBase", BinaryOperand, DtypeObj]: + lhs = self # type: column.ColumnBase if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -122,7 +145,7 @@ def _binary_op_floordiv(self, rhs): return lhs, rhs, out_dtype - def _binary_op_mul(self, rhs): + def _binary_op_mul(self, rhs: BinaryOperand) -> DtypeObj: if rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -132,7 +155,7 @@ def _binary_op_mul(self, rhs): ) return out_dtype - def _binary_op_mod(self, rhs): + def _binary_op_mod(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = determine_out_dtype(self.dtype, rhs.dtype) elif rhs.dtype.kind in ("f", "i", "u"): @@ -144,7 +167,7 @@ def _binary_op_mod(self, rhs): ) return out_dtype - def _binary_op_eq_ne(self, rhs): + def _binary_op_eq_ne(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = np.bool else: @@ -154,7 +177,7 @@ def _binary_op_eq_ne(self, rhs): ) return out_dtype - def _binary_op_lt_gt_le_ge(self, rhs): + def _binary_op_lt_gt_le_ge(self, rhs: BinaryOperand) -> DtypeObj: if pd.api.types.is_timedelta64_dtype(rhs.dtype): return np.bool else: @@ -163,8 +186,10 @@ def _binary_op_lt_gt_le_ge(self, rhs): f" and {rhs.dtype}" ) - def _binary_op_truediv(self, rhs): - lhs, rhs = self, rhs + def 
_binary_op_truediv( + self, rhs: BinaryOperand + ) -> Tuple["column.ColumnBase", BinaryOperand, DtypeObj]: + lhs = self # type: column.ColumnBase if pd.api.types.is_timedelta64_dtype(rhs.dtype): common_dtype = determine_out_dtype(self.dtype, rhs.dtype) lhs = lhs.astype(common_dtype).astype("float64") @@ -187,7 +212,9 @@ def _binary_op_truediv(self, rhs): return lhs, rhs, out_dtype - def binary_operator(self, op, rhs, reflect=False): + def binary_operator( + self, op: str, rhs: BinaryOperand, reflect: bool = False + ) -> "column.ColumnBase": lhs, rhs = self, rhs if op in ("eq", "ne"): @@ -199,14 +226,14 @@ def binary_operator(self, op, rhs, reflect=False): elif op == "mod": out_dtype = self._binary_op_mod(rhs) elif op == "truediv": - lhs, rhs, out_dtype = self._binary_op_truediv(rhs) + lhs, rhs, out_dtype = self._binary_op_truediv(rhs) # type: ignore elif op == "floordiv": - lhs, rhs, out_dtype = self._binary_op_floordiv(rhs) + lhs, rhs, out_dtype = self._binary_op_floordiv(rhs) # type: ignore op = "truediv" elif op == "add": - out_dtype = _timedelta_binary_op_add(lhs, rhs) + out_dtype = _timedelta_add_result_dtype(lhs, rhs) elif op == "sub": - out_dtype = _timedelta_binary_op_sub(lhs, rhs) + out_dtype = _timedelta_sub_result_dtype(lhs, rhs) else: raise TypeError( f"Series of dtype {self.dtype} cannot perform " @@ -214,10 +241,11 @@ def binary_operator(self, op, rhs, reflect=False): ) if reflect: - lhs, rhs = rhs, lhs + lhs, rhs = rhs, lhs # type: ignore + return binop(lhs, rhs, op=op, out_dtype=out_dtype) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> BinaryOperand: if isinstance(other, cudf.Scalar): return other @@ -247,30 +275,34 @@ def normalize_binop_value(self, other): raise TypeError(f"cannot normalize {type(other)}") @property - def as_numerical(self): - - return column.build_column( - data=self.base_data, - dtype=np.int64, - mask=self.base_mask, - offset=self.offset, - size=self.size, + def as_numerical(self) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", + column.build_column( + data=self.base_data, + dtype=np.int64, + mask=self.base_mask, + offset=self.offset, + size=self.size, + ), ) - def default_na_value(self): + def default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ return np.timedelta64("nat", self.time_unit) @property - def time_unit(self): + def time_unit(self) -> str: return self._time_unit - def fillna(self, fill_value=None, method=None): + def fillna( + self, fill_value: Any = None, method: str = None, dtype: Dtype = None + ) -> TimeDeltaColumn: if fill_value is not None: if cudf.utils.utils.isnat(fill_value): return _fillna_natwise(self) - col = self + col = self # type: column.ColumnBase if is_scalar(fill_value): if isinstance(fill_value, np.timedelta64): dtype = determine_out_dtype(self.dtype, fill_value.dtype) @@ -280,51 +312,61 @@ def fillna(self, fill_value=None, method=None): fill_value = cudf.Scalar(fill_value, dtype=dtype) else: fill_value = column.as_column(fill_value, nan_as_null=False) - - return ColumnBase.fillna(col, fill_value) + return cast(TimeDeltaColumn, ColumnBase.fillna(col, fill_value)) else: return super().fillna(method=method) - def as_numerical_column(self, dtype): - return self.as_numerical.astype(dtype) + def as_numerical_column( + self, dtype: Dtype + ) -> "cudf.core.column.NumericalColumn": + return cast( + "cudf.core.column.NumericalColumn", self.as_numerical.astype(dtype) + ) - def as_datetime_column(self, dtype, 
**kwargs): + def as_datetime_column( + self, dtype: Dtype, **kwargs + ) -> "cudf.core.column.DatetimeColumn": raise TypeError( f"cannot astype a timedelta from [{self.dtype}] to [{dtype}]" ) - def as_string_column(self, dtype, **kwargs): - - if not kwargs.get("format"): - fmt = _dtype_to_format_conversion.get( + def as_string_column( + self, dtype: Dtype, format=None + ) -> "cudf.core.column.StringColumn": + if format is None: + format = _dtype_to_format_conversion.get( self.dtype.name, "%D days %H:%M:%S" ) - kwargs["format"] = fmt if len(self) > 0: return string._timedelta_to_str_typecast_functions[ np.dtype(self.dtype) - ](self, **kwargs) + ](self, format=format) else: - return column.column_empty(0, dtype="object", masked=False) + return cast( + "cudf.core.column.StringColumn", + column.column_empty(0, dtype="object", masked=False), + ) - def as_timedelta_column(self, dtype, **kwargs): + def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: dtype = np.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype=np.float64): + def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ) - def median(self, skipna=None): + def median(self, skipna: bool = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.median(skipna=skipna), unit=self.time_unit ) - def quantile(self, q, interpolation, exact): + def quantile( + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + ) -> "column.ColumnBase": result = self.as_numerical.quantile( q=q, interpolation=interpolation, exact=exact ) @@ -332,7 +374,9 @@ def quantile(self, q, interpolation, exact): return pd.Timedelta(result, unit=self.time_unit) return result.astype(self.dtype) - def sum(self, skipna=None, dtype=None, min_count=0): + def sum( + self, skipna: bool = None, dtype: Dtype = None, min_count=0 + ) -> pd.Timedelta: if len(self) == 0: return pd.Timedelta(None, unit=self.time_unit) else: @@ -343,13 +387,15 @@ def sum(self, skipna=None, dtype=None, min_count=0): unit=self.time_unit, ) - def std(self, skipna=None, ddof=1, dtype=np.float64): + def std( + self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + ) -> pd.Timedelta: return pd.Timedelta( self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), unit=self.time_unit, ) - def components(self, index=None): + def components(self, index=None) -> "cudf.DataFrame": """ Return a Dataframe of the components of the Timedeltas. @@ -443,7 +489,7 @@ def components(self, index=None): ) @property - def days(self): + def days(self) -> "cudf.core.column.NumericalColumn": """ Number of days for each element. @@ -456,7 +502,7 @@ def days(self): ) @property - def seconds(self): + def seconds(self) -> "cudf.core.column.NumericalColumn": """ Number of seconds (>= 0 and less than 1 day). @@ -479,7 +525,7 @@ def seconds(self): ) @property - def microseconds(self): + def microseconds(self) -> "cudf.core.column.NumericalColumn": """ Number of microseconds (>= 0 and less than 1 second). @@ -499,7 +545,7 @@ def microseconds(self): ) @property - def nanoseconds(self): + def nanoseconds(self) -> "cudf.core.column.NumericalColumn": """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. 
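
For orientation, the reductions annotated above (mean, median, std, sum) are computed through as_numerical and returned as host-side pandas.Timedelta scalars, while the component accessors (days, seconds, microseconds, nanoseconds) stay on device as NumericalColumns. A minimal sketch of the resulting user-visible behaviour, assuming a small millisecond-resolution series (values chosen only for illustration):

import cudf

s = cudf.Series([1000, 2000, 3000], dtype="timedelta64[ms]")
s.mean()         # host-side pandas Timedelta (here 2 seconds)
s.dt.seconds     # device-side cudf Series of integer seconds
s.dt.components  # cudf DataFrame with days, hours, ..., nanoseconds columns
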
@@ -524,12 +570,17 @@ def nanoseconds(self): @annotate("BINARY_OP", color="orange", domain="cudf_python") -def binop(lhs, rhs, op, out_dtype): +def binop( + lhs: "column.ColumnBase", + rhs: "column.ColumnBase", + op: str, + out_dtype: DtypeObj, +) -> "cudf.core.column.ColumnBase": out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) return out -def determine_out_dtype(lhs_dtype, rhs_dtype): +def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): return rhs_dtype elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): @@ -538,7 +589,9 @@ def determine_out_dtype(lhs_dtype, rhs_dtype): raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") -def _timedelta_binary_op_add(lhs, rhs): +def _timedelta_add_result_dtype( + lhs: BinaryOperand, rhs: BinaryOperand +) -> Dtype: if pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = determine_out_dtype(lhs.dtype, rhs.dtype) elif pd.api.types.is_datetime64_dtype(rhs.dtype): @@ -557,7 +610,9 @@ def _timedelta_binary_op_add(lhs, rhs): return out_dtype -def _timedelta_binary_op_sub(lhs, rhs): +def _timedelta_sub_result_dtype( + lhs: BinaryOperand, rhs: BinaryOperand +) -> Dtype: if pd.api.types.is_timedelta64_dtype( lhs.dtype ) and pd.api.types.is_timedelta64_dtype(rhs.dtype): diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c750cc92f30..f5823528d02 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,5 +1,11 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from __future__ import annotations + import itertools +from collections import OrderedDict from collections.abc import MutableMapping +from typing import TYPE_CHECKING, Any, Tuple, Union import pandas as pd @@ -11,9 +17,22 @@ to_nested_dict, ) +if TYPE_CHECKING: + from cudf.core.column import ColumnBase + class ColumnAccessor(MutableMapping): - def __init__(self, data=None, multiindex=False, level_names=None): + + _data: "OrderedDict[Any, ColumnBase]" + multiindex: bool + _level_names: Tuple[Any, ...] 
+ + def __init__( + self, + data: Union[MutableMapping, ColumnAccessor] = None, + multiindex: bool = False, + level_names=None, + ): """ Parameters ---------- @@ -33,7 +52,7 @@ def __init__(self, data=None, multiindex=False, level_names=None): if isinstance(data, ColumnAccessor): multiindex = multiindex or data.multiindex level_names = level_names or data.level_names - self._data = data + self._data = data._data self.multiindex = multiindex self._level_names = level_names @@ -44,21 +63,21 @@ def __init__(self, data=None, multiindex=False, level_names=None): def __iter__(self): return self._data.__iter__() - def __getitem__(self, key): + def __getitem__(self, key: Any) -> ColumnBase: return self._data[key] - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any): self.set_by_label(key, value) self._clear_cache() - def __delitem__(self, key): + def __delitem__(self, key: Any): self._data.__delitem__(key) self._clear_cache() - def __len__(self): + def __len__(self) -> int: return len(self._data) - def __repr__(self): + def __repr__(self) -> str: data_repr = self._data.__repr__() multiindex_repr = self.multiindex.__repr__() level_names_repr = self.level_names.__repr__() @@ -70,14 +89,14 @@ def __repr__(self): ) @property - def level_names(self): + def level_names(self) -> Tuple[Any, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: return self._level_names @property - def nlevels(self): + def nlevels(self) -> int: if len(self._data) == 0: return 0 if not self.multiindex: @@ -86,28 +105,28 @@ def nlevels(self): return len(next(iter(self.keys()))) @property - def name(self): + def name(self) -> Any: if len(self._data) == 0: return None return self.level_names[-1] @property - def nrows(self): + def nrows(self) -> int: if len(self._data) == 0: return 0 else: return len(next(iter(self.values()))) @cached_property - def names(self): + def names(self) -> Tuple[Any, ...]: return tuple(self.keys()) @cached_property - def columns(self): + def columns(self) -> Tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property - def _grouped_data(self): + def _grouped_data(self) -> MutableMapping: """ If self.multiindex is True, return the underlying mapping as a nested mapping. @@ -125,7 +144,7 @@ def _clear_cache(self): except AttributeError: pass - def to_pandas_index(self): + def to_pandas_index(self) -> pd.Index: """" Convert the keys of the ColumnAccessor to a Pandas Index object. """ @@ -142,7 +161,7 @@ def to_pandas_index(self): result = pd.Index(self.names, name=self.name, tupleize_cols=False) return result - def insert(self, name, value, loc=-1): + def insert(self, name: Any, value: Any, loc: int = -1): """ Insert column into the ColumnAccessor at the specified location. @@ -176,10 +195,10 @@ def insert(self, name, value, loc=-1): else: new_keys = self.names[:loc] + (name,) + self.names[loc:] new_values = self.columns[:loc] + (value,) + self.columns[loc:] - self._data = self._data.__class__(zip(new_keys, new_values),) + self._data = self._data.__class__(zip(new_keys, new_values)) self._clear_cache() - def copy(self, deep=False): + def copy(self, deep=False) -> ColumnAccessor: """ Make a copy of this ColumnAccessor. """ @@ -195,7 +214,7 @@ def copy(self, deep=False): level_names=self.level_names, ) - def select_by_label(self, key): + def select_by_label(self, key: Any) -> ColumnAccessor: """ Return a subset of this column accessor, composed of the keys specified by `key`. 
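
ColumnAccessor is cudf's internal ordered mapping from column labels to columns, which is what the annotations above describe. A rough sketch of that mapping surface (internal API, subject to change; as_column is the existing cudf column constructor, and the commented results are what the annotated signatures imply):

from cudf.core.column import as_column
from cudf.core.column_accessor import ColumnAccessor

ca = ColumnAccessor({"a": as_column([1, 2]), "b": as_column([3, 4])})
ca.names              # ("a", "b"), a Tuple[Any, ...]
ca.nrows              # 2
ca.to_pandas_index()  # pd.Index(["a", "b"])
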
@@ -218,7 +237,7 @@ def select_by_label(self, key): return self._select_by_label_with_wildcard(key) return self._select_by_label_grouped(key) - def select_by_index(self, index): + def select_by_index(self, index: Any) -> ColumnAccessor: """ Return a ColumnAccessor composed of the columns specified by index. @@ -243,7 +262,7 @@ def select_by_index(self, index): data, multiindex=self.multiindex, level_names=self.level_names, ) - def set_by_label(self, key, value): + def set_by_label(self, key: Any, value: Any): """ Add (or modify) column by name. @@ -256,14 +275,14 @@ def set_by_label(self, key, value): self._data[key] = value self._clear_cache() - def _select_by_label_list_like(self, key): + def _select_by_label_list_like(self, key: Any) -> ColumnAccessor: return self.__class__( to_flat_dict({k: self._grouped_data[k] for k in key}), multiindex=self.multiindex, level_names=self.level_names, ) - def _select_by_label_grouped(self, key): + def _select_by_label_grouped(self, key: Any) -> ColumnAccessor: result = self._grouped_data[key] if isinstance(result, cudf.core.column.ColumnBase): return self.__class__({key: result}) @@ -277,7 +296,7 @@ def _select_by_label_grouped(self, key): level_names=self.level_names[len(key) :], ) - def _select_by_label_slice(self, key): + def _select_by_label_slice(self, key: slice) -> ColumnAccessor: start, stop = key.start, key.stop if key.step is not None: raise TypeError("Label slicing with step is not supported") @@ -303,7 +322,7 @@ def _select_by_label_slice(self, key): level_names=self.level_names, ) - def _select_by_label_with_wildcard(self, key): + def _select_by_label_with_wildcard(self, key: Any) -> ColumnAccessor: key = self._pad_key(key, slice(None)) return self.__class__( {k: self._data[k] for k in self._data if _compare_keys(k, key)}, @@ -311,7 +330,7 @@ def _select_by_label_with_wildcard(self, key): level_names=self.level_names, ) - def _pad_key(self, key, pad_value=""): + def _pad_key(self, key: Any, pad_value="") -> Any: """ Pad the provided key to a length equal to the number of levels. @@ -323,7 +342,7 @@ def _pad_key(self, key, pad_value=""): return key + (pad_value,) * (self.nlevels - len(key)) -def _compare_keys(target, key): +def _compare_keys(target: Any, key: Any) -> bool: """ Compare `key` to `target`. 
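
The wildcard selection path above pads a partial key with slice(None) via _pad_key and then matches it against each stored tuple key with _compare_keys. Since _compare_keys is only shown by signature here, the following is a simplified standalone sketch of that matching logic under those assumptions, not the exact implementation:

def pad_key(key, nlevels, pad_value=""):
    # Mirrors _pad_key: promote scalars to tuples, then pad to nlevels.
    if not isinstance(key, tuple):
        key = (key,)
    return key + (pad_value,) * (nlevels - len(key))

def compare_keys(target, key):
    # A slice(None) component acts as a wildcard at that level.
    return all(k == slice(None) or t == k for t, k in zip(target, key))

# Selecting "a" from two-level keys matches ("a", "x") but not ("b", "x"):
assert compare_keys(("a", "x"), pad_key("a", 2, slice(None)))
assert not compare_keys(("b", "x"), pad_key("a", 2, slice(None)))
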
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f9b61a60830..e5626190098 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -9,6 +9,7 @@ import warnings from collections import OrderedDict, defaultdict from collections.abc import Iterable, Mapping, Sequence +from typing import Any, Set import cupy import numpy as np @@ -2364,7 +2365,7 @@ def iteritems(self): for k in self: yield (k, self[k]) - @property + @property # type: ignore @annotate("DATAFRAME_LOC", color="blue", domain="cudf_python") def loc(self): """ @@ -2535,14 +2536,14 @@ def at(self): """ return self.loc - @property + @property # type: ignore @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): """Returns a tuple of columns """ return self._data.to_pandas_index() - @columns.setter + @columns.setter # type: ignore @annotate("DATAFRAME_COLUMNS_SETTER", color="yellow", domain="cudf_python") def columns(self, columns): if isinstance(columns, (cudf.MultiIndex, cudf.Index)): @@ -4229,14 +4230,13 @@ def join( ) return df - @copy_docstring(DataFrameGroupBy.__init__) def groupby( self, by=None, axis=0, level=None, as_index=True, - sort=True, + sort=False, group_keys=True, squeeze=False, observed=False, @@ -4274,7 +4274,6 @@ def groupby( sort=sort, ) - @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None ): @@ -7272,7 +7271,7 @@ def equals(self, other): return False return super().equals(other) - _accessors = set() + _accessors = set() # type: Set[Any] def from_pandas(obj, nan_as_null=None): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index ccd92de69fc..b89b3ddb2be 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -2,6 +2,7 @@ import decimal import pickle +from typing import Any import numpy as np import pandas as pd @@ -9,10 +10,11 @@ from pandas.api.extensions import ExtensionDtype import cudf +from cudf._typing import Dtype class CategoricalDtype(ExtensionDtype): - def __init__(self, categories=None, ordered=None): + def __init__(self, categories=None, ordered: bool = None) -> None: """ dtype similar to pd.CategoricalDtype with the categories stored on the GPU. 
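
The dataframe.py hunks above change DataFrame.groupby to default to sort=False, matching the GroupBy, DataFrameGroupBy and SeriesGroupBy changes later in this diff: group keys are no longer sorted unless explicitly requested. A minimal usage sketch of the new default (data chosen for illustration; group order in the unsorted case is unspecified):

import cudf

df = cudf.DataFrame({"a": [2, 1, 1], "b": [10, 20, 30]})
df.groupby("a").sum()             # faster; key order not guaranteed
df.groupby("a", sort=True).sum()  # pandas-like sorted group keys
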
@@ -21,7 +23,7 @@ def __init__(self, categories=None, ordered=None): self.ordered = ordered @property - def categories(self): + def categories(self) -> "cudf.core.index.Index": if self._categories is None: return cudf.core.index.as_index( cudf.core.column.column_empty(0, dtype="object", masked=False) @@ -41,23 +43,23 @@ def str(self): return "|O08" @classmethod - def from_pandas(cls, dtype): + def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) - def to_pandas(self): + def to_pandas(self) -> pd.CategoricalDtype: if self.categories is None: categories = None else: categories = self.categories.to_pandas() return pd.CategoricalDtype(categories=categories, ordered=self.ordered) - def _init_categories(self, categories): + def _init_categories(self, categories: Any): if categories is None: return categories if len(categories) == 0: - dtype = "object" + dtype = "object" # type: Any else: dtype = None @@ -68,7 +70,7 @@ def _init_categories(self, categories): else: return column - def __eq__(self, other): + def __eq__(self, other: Dtype) -> bool: if isinstance(other, str): return other == self.name elif other is self: @@ -111,10 +113,10 @@ def deserialize(cls, header, frames): class ListDtype(ExtensionDtype): + _typ: pa.ListType + name: str = "list" - name = "list" - - def __init__(self, element_type): + def __init__(self, element_type: Any) -> None: if isinstance(element_type, ListDtype): self._typ = pa.list_(element_type._typ) else: @@ -124,7 +126,7 @@ def __init__(self, element_type): self._typ = pa.list_(element_type) @property - def element_type(self): + def element_type(self) -> Dtype: if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) else: @@ -220,18 +222,47 @@ def __hash__(self): return hash(self._typ) -class DecimalDtype(ExtensionDtype): +class Decimal64Dtype(ExtensionDtype): name = "decimal" _metadata = ("precision", "scale") + _MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) - def __init__(self, precision, scale): + def __init__(self, precision, scale=0): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal64Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. 
+ """ + self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @property def precision(self): return self._typ.precision + @precision.setter + def precision(self, value): + self._validate(value, self.scale) + self._typ = pa.decimal128(precision=value, scale=self.scale) + @property def scale(self): return self._typ.scale @@ -248,5 +279,25 @@ def to_arrow(self): def from_arrow(cls, typ): return cls(typ.precision, typ.scale) + @property + def itemsize(self): + return 8 + + def __repr__(self): + return ( + f"{self.__class__.__name__}" + f"(precision={self.precision}, scale={self.scale})" + ) + def __hash__(self): return hash(self._typ) + + @classmethod + def _validate(cls, precision, scale=0): + if precision > Decimal64Dtype._MAX_PRECISION: + raise ValueError( + f"Cannot construct a {cls.__name__}" + f" with precision > {cls._MAX_PRECISION}" + ) + if abs(scale) > precision: + raise ValueError(f"scale={scale} exceeds precision={precision}") diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ad4069dfb68..3d12ac2e6cc 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -5,6 +5,7 @@ import operator import warnings from collections import OrderedDict, abc as abc +from typing import overload import cupy import numpy as np @@ -12,6 +13,7 @@ import pyarrow as pa from nvtx import annotate from pandas.api.types import is_dict_like, is_dtype_equal +from typing_extensions import Literal import cudf from cudf import _lib as libcudf @@ -39,9 +41,23 @@ class Frame(libcudf.table.Table): """ @classmethod - def _from_table(cls, table): + def _from_table(cls, table: "Frame"): return cls(table._data, index=table._index) + @overload + def _mimic_inplace(self, result: "Frame") -> "Frame": + ... + + @overload + def _mimic_inplace(self, result: "Frame", inplace: Literal[True]): + ... + + @overload + def _mimic_inplace( + self, result: "Frame", inplace: Literal[False] + ) -> "Frame": + ... + def _mimic_inplace(self, result, inplace=False): if inplace: for col in self._data: @@ -1296,7 +1312,9 @@ def dropna( 0 Alfred Batmobile 1940-04-25 """ if axis == 0: - result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) + result = self._drop_na_rows( + how=how, subset=subset, thresh=thresh, drop_nan=True + ) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh @@ -1443,7 +1461,9 @@ def fillna( return self._mimic_inplace(result, inplace=inplace) - def _drop_na_rows(self, how="any", subset=None, thresh=None): + def _drop_na_rows( + self, how="any", subset=None, thresh=None, drop_nan=False + ): """ Drops null rows from `self`. 
@@ -1475,12 +1495,23 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
             ]
             if len(subset_cols) == 0:
                 return self.copy(deep=True)
-        result = self.__class__._from_table(
+
+        frame = self.copy(deep=False)
+        if drop_nan:
+            for name, col in frame._data.items():
+                if name in subset and isinstance(
+                    col, cudf.core.column.NumericalColumn
+                ):
+                    frame._data[name] = col.nans_to_nulls()
+                else:
+                    frame._data[name] = col
+
+        result = frame.__class__._from_table(
             libcudf.stream_compaction.drop_nulls(
-                self, how=how, keys=subset, thresh=thresh
+                frame, how=how, keys=subset, thresh=thresh
             )
         )
-        result._postprocess_columns(self)
+        result._postprocess_columns(frame)
         return result

     def _drop_na_columns(self, how="any", subset=None, thresh=None):
@@ -1501,7 +1532,10 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
             thresh = len(df)

         for col in self._data.names:
-            if (len(df[col]) - df[col].null_count) < thresh:
+            no_threshold_valid_count = (
+                len(df[col]) - df[col].nans_to_nulls().null_count
+            ) < thresh
+            if no_threshold_valid_count:
                 continue
             out_cols.append(col)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b60815722c6..8af3b6f1d81 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -18,7 +18,7 @@ class GroupBy(Serializable):
     _MAX_GROUPS_BEFORE_WARN = 100

     def __init__(
-        self, obj, by=None, level=None, sort=True, as_index=True, dropna=True
+        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
     ):
         """
         Group a DataFrame or Series by a set of columns.
@@ -37,9 +37,9 @@ def __init__(
         level : int, level_name or list, optional
             For objects with a MultiIndex, `level` can be used to specify
             grouping by one or more levels of the MultiIndex.
-        sort : True, optional
-            If True (default), sort results by group9s). Note that
-            unlike Pandas, this also sorts values within each group.
+        sort : bool, default False
+            Sort the result by group keys. Unlike Pandas, cudf defaults
+            to ``False`` for better performance.
         as_index : bool, optional
             If as_index=True (default), the group names appear
             as the keys of the resulting DataFrame.
@@ -101,7 +101,7 @@ def size(self):
                     len(self.obj), "int8", masked=False
                 )
             )
-            .groupby(self.grouping)
+            .groupby(self.grouping, sort=self._sort)
             .agg("size")
         )

@@ -126,12 +126,13 @@ def agg(self, func):
         Examples
         --------
         >>> import cudf
-        >>> a = cudf.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 3]})
+        >>> a = cudf.DataFrame(
+            {'a': [1, 1, 2], 'b': [1, 2, 3], 'c': [2, 2, 1]})
         >>> a.groupby('a').agg('sum')
            b
         a
-        1  3
         2  3
+        1  3

         Specifying a list of aggregations to perform on each column.

@@ -139,8 +140,8 @@ def agg(self, func):
            b       c
            sum min sum min
         a
-        1  3   1   4   2
         2  3   3   1   1
+        1  3   1   4   2

         Using a dict to specify aggregations to perform per column.

@@ -148,8 +149,8 @@ def agg(self, func):
            a       b
            max min mean
         a
-        1  1   1   1.5
         2  2   3   3.0
+        1  1   1   1.5

         Using lambdas/callables to specify aggregations taking parameters.

@@ -591,7 +592,7 @@ def rolling(self, *args, **kwargs):

 class DataFrameGroupBy(GroupBy):
     def __init__(
-        self, obj, by=None, level=None, sort=True, as_index=True, dropna=True
+        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
     ):
         """
         Group DataFrame using a mapper or by a Series of columns.
@@ -618,10 +619,11 @@ def __init__(
             For aggregated output, return object with group labels as the
             index. Only relevant for DataFrame input.
             as_index=False is effectively “SQL-style” grouped output.
-        sort : bool, default True
-            Sort group keys. Get better performance by turning this off.
-            Note this does not influence the order of observations within each
-            group. Groupby preserves the order of rows within each group.
+        sort : bool, default False
+            Sort result by group key. Unlike Pandas, cudf defaults to
+            ``False`` for better performance. Note this does not influence
+            the order of observations within each group. Groupby preserves
+            the order of rows within each group.
         dropna : bool, optional
             If True (default), do not include the "null" group.
@@ -670,8 +672,8 @@ def __init__(
        >>> df.groupby(level="Type").mean()
                Max Speed
        Type
-       Captive      210.0
        Wild         185.0
+       Captive      210.0

        """

        super().__init__(
@@ -689,12 +691,14 @@ def __getattribute__(self, key):
         except AttributeError:
             if key in self.obj:
                 return self.obj[key].groupby(
-                    self.grouping, dropna=self._dropna
+                    self.grouping, dropna=self._dropna, sort=self._sort
                 )
             raise

     def __getitem__(self, key):
-        return self.obj[key].groupby(self.grouping, dropna=self._dropna)
+        return self.obj[key].groupby(
+            self.grouping, dropna=self._dropna, sort=self._sort
+        )

     def nunique(self):
         """
@@ -705,7 +709,7 @@ class SeriesGroupBy(GroupBy):
     def __init__(
-        self, obj, by=None, level=None, sort=True, as_index=True, dropna=True
+        self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
     ):
         """
         Group Series using a mapper or by a Series of columns.
@@ -732,10 +736,11 @@ def __init__(
             For aggregated output, return object with group labels as the
             index. Only relevant for DataFrame input.
             as_index=False is effectively “SQL-style” grouped output.
-        sort : bool, default True
-            Sort group keys. Get better performance by turning this off.
-            Note this does not influence the order of observations within each
-            group. Groupby preserves the order of rows within each group.
+        sort : bool, default False
+            Sort result by group key. Unlike Pandas, cudf defaults to
+            ``False`` for better performance. Note this does not influence
+            the order of observations within each group. Groupby preserves
+            the order of rows within each group.

         Returns
         -------
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 219d355d3cc..e3899a403f1 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2018-2020, NVIDIA CORPORATION.
-from __future__ import division, print_function +from __future__ import annotations, division, print_function import pickle from numbers import Number +from typing import Any, Dict, Set, Type import cupy import numpy as np @@ -132,6 +133,13 @@ def __init__( """ pass + @cached_property + def _values(self) -> ColumnBase: + raise NotImplementedError + + def __getitem__(self, key): + raise NotImplementedError() + def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed @@ -1485,7 +1493,11 @@ def _from_table(cls, table): else: return as_index(table) - _accessors = set() + _accessors = set() # type: Set[Any] + + @property + def _constructor_expanddim(self): + return cudf.MultiIndex class RangeIndex(Index): @@ -1773,7 +1785,7 @@ def find_label_range(self, first=None, last=None): return begin, end - @copy_docstring(_to_frame) + @copy_docstring(_to_frame) # type: ignore def to_frame(self, index=True, name=None): return _to_frame(self, index, name) @@ -2028,7 +2040,7 @@ def __getitem__(self, index): else: return res - @copy_docstring(_to_frame) + @copy_docstring(_to_frame) # type: ignore def to_frame(self, index=True, name=None): return _to_frame(self, index, name) @@ -2705,15 +2717,11 @@ def __repr__(self): + ")" ) - @copy_docstring(StringMethods.__init__) + @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): return StringMethods(column=self._values, parent=self) - @property - def _constructor_expanddim(self): - return cudf.MultiIndex - def _clean_nulls_from_index(self): """ Convert all na values(if any) in Index object @@ -2725,7 +2733,7 @@ def _clean_nulls_from_index(self): return self -def as_index(arbitrary, **kwargs): +def as_index(arbitrary, **kwargs) -> Index: """Create an Index from an arbitrary object Currently supported inputs are: @@ -2794,7 +2802,7 @@ def as_index(arbitrary, **kwargs): np.uint64: UInt64Index, np.float32: Float32Index, np.float64: Float64Index, -} +} # type: Dict[Any, Type[Index]] _index_to_dtype = { Int8Index: np.int8, diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 3872e296ed5..4ea32c77724 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -329,6 +329,9 @@ def _dispatch_scalar_unaop(self, op): return np.ceil(self.value) return getattr(self.value, op)() + def astype(self, dtype): + return Scalar(self.device_value, dtype) + class _NAType(object): def __init__(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 147262be08d..dfc687eb76d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5,6 +5,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size +from typing import Any, Set from uuid import uuid4 import cupy @@ -43,6 +44,7 @@ from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, + is_decimal_dtype, is_list_dtype, is_list_like, is_mixed_with_object_dtype, @@ -1099,6 +1101,7 @@ def __repr__(self): preprocess._column, cudf.core.column.CategoricalColumn ) and not is_list_dtype(preprocess.dtype) + and not is_decimal_dtype(preprocess.dtype) ) or isinstance( preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn ): @@ -1705,17 +1708,17 @@ def __neg__(self): """ return self.__mul__(-1) - @copy_docstring(CategoricalAccessor.__init__) + @copy_docstring(CategoricalAccessor.__init__) # type: ignore @property def cat(self): return CategoricalAccessor(column=self._column, 
parent=self) - @copy_docstring(StringMethods.__init__) + @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): return StringMethods(column=self._column, parent=self) - @copy_docstring(ListMethods.__init__) + @copy_docstring(ListMethods.__init__) # type: ignore @property def list(self): return ListMethods(column=self._column, parent=self) @@ -4180,7 +4183,7 @@ def groupby( axis=0, level=None, as_index=True, - sort=True, + sort=False, group_keys=True, squeeze=False, observed=False, @@ -4442,7 +4445,7 @@ def keys(self): """ return self.index - _accessors = set() + _accessors = set() # type: Set[Any] truediv_int_dtype_corrections = { diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index a6ce2c85e42..7d1ab3a5435 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -374,7 +374,7 @@ def __init__(self, groupby, window, min_periods=None, center=False): self._group_keys = groupby.grouping.keys.take(sort_order) obj = groupby.obj.take(sort_order) - gb_size = groupby.size() + gb_size = groupby.size().sort_index() self._group_starts = ( gb_size.cumsum().shift(1).fillna(0).repeat(gb_size) ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index d6e0fedf8e0..7c8455b6575 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -7,9 +7,11 @@ from pyarrow import orc as orc import cudf -from cudf import _lib as libcudf +from cudf._lib import orc as liborc from cudf.utils import ioutils -from cudf.utils.metadata import orc_column_statistics_pb2 as cs_pb2 +from cudf.utils.metadata import ( # type: ignore + orc_column_statistics_pb2 as cs_pb2, +) def _make_empty_df(filepath_or_buffer, columns): @@ -127,7 +129,7 @@ def read_orc_statistics( column_names, raw_file_statistics, raw_stripes_statistics, - ) = libcudf.orc.read_raw_orc_statistics(filepath_or_buffer) + ) = liborc.read_raw_orc_statistics(filepath_or_buffer) # Parse column names column_names = [ @@ -257,7 +259,7 @@ def read_orc( if engine == "cudf": df = DataFrame._from_table( - libcudf.orc.read_orc( + liborc.read_orc( filepath_or_buffer, columns, stripes, @@ -324,9 +326,9 @@ def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.orc.write_orc(df, file_obj, compression, enable_statistics) + liborc.write_orc(df, file_obj, compression, enable_statistics) else: - libcudf.orc.write_orc(df, path_or_buf, compression, enable_statistics) + liborc.write_orc(df, path_or_buf, compression, enable_statistics) -ORCWriter = libcudf.orc.ORCWriter +ORCWriter = liborc.ORCWriter diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index bf8898825c0..2048e574acc 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,5 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from __future__ import annotations + from typing import Union import numpy as np diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 93bc6d1c573..c821755f670 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1,7 +1,8 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
from __future__ import division +import decimal import operator import random from itertools import product @@ -1567,3 +1568,161 @@ def test_binops_with_NA_consistent(dtype, op): assert (result == expect_all).all() elif dtype in DATETIME_TYPES & TIMEDELTA_TYPES: assert result._column.null_count == len(data) + + +@pytest.mark.parametrize( + "args", + [ + ( + operator.add, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["3.0", "4.0"], + cudf.Decimal64Dtype(scale=2, precision=3), + ), + ( + operator.add, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["3.75", "3.005"], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", "0.2"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["100.1", "200.2"], + cudf.Decimal64Dtype(scale=3, precision=9), + ), + ( + operator.sub, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", "0.995"], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", "0.995"], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", "0.2"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["99.9", "199.8"], + cudf.Decimal64Dtype(scale=3, precision=9), + ), + ( + operator.mul, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", "3.0"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["2.25", "6.0"], + cudf.Decimal64Dtype(scale=5, precision=7), + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", "0.2"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["10.0", "40.0"], + cudf.Decimal64Dtype(scale=1, precision=8), + ), + ( + operator.mul, + ["1000", "2000"], + cudf.Decimal64Dtype(scale=-3, precision=4), + ["0.343", "0.500"], + cudf.Decimal64Dtype(scale=3, precision=3), + ["343.0", "1000.0"], + cudf.Decimal64Dtype(scale=0, precision=8), + ), + ( + operator.add, + ["1.5", None, "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", None, "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["3.0", None, "4.0"], + cudf.Decimal64Dtype(scale=2, precision=3), + ), + ( + operator.add, + ["1.5", None], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", "1.005"], + cudf.Decimal64Dtype(scale=3, precision=4), + ["3.75", None], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["1.5", None], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", None], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", None], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.sub, + ["1.5", "2.0"], + cudf.Decimal64Dtype(scale=2, precision=2), + ["2.25", None], + cudf.Decimal64Dtype(scale=3, precision=4), + ["-0.75", None], + cudf.Decimal64Dtype(scale=3, precision=5), + ), + ( + operator.mul, + ["1.5", None], + cudf.Decimal64Dtype(scale=2, precision=2), + ["1.5", None], + cudf.Decimal64Dtype(scale=3, precision=4), + ["2.25", None], + cudf.Decimal64Dtype(scale=5, precision=7), + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["0.1", None], + cudf.Decimal64Dtype(scale=3, 
precision=4), + ["10.0", None], + cudf.Decimal64Dtype(scale=1, precision=8), + ), + ], +) +def test_binops_decimal(args): + op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args + + def decimal_series(input, dtype): + return cudf.Series( + [x if x is None else decimal.Decimal(x) for x in input], + dtype=dtype, + ) + + a = decimal_series(lhs, l_dtype) + b = decimal_series(rhs, r_dtype) + expect = decimal_series(expect, expect_dtype) + + got = op(a, b) + assert expect.dtype == got.dtype + utils.assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 62427cc593e..964e79a57b0 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. + import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py new file mode 100644 index 00000000000..f73a785727b --- /dev/null +++ b/python/cudf/cudf/tests/test_decimal.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from decimal import Decimal + +import pyarrow as pa +import pytest + +from cudf.core.column import DecimalColumn + + +@pytest.mark.parametrize( + "data", + [ + [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [1], + [-1], + [1, 2, 3, 4], + [42, 1729, 4104], + [1, 2, None, 4], + [None, None, None], + [], + ], +) +@pytest.mark.parametrize( + "typ", + [ + pa.decimal128(precision=4, scale=2), + pa.decimal128(precision=5, scale=3), + pa.decimal128(precision=6, scale=4), + ], +) +def test_round_trip_decimal_column(data, typ): + pa_arr = pa.array(data, type=typ) + col = DecimalColumn.from_arrow(pa_arr) + assert pa_arr.equals(col.to_arrow()) + + +def test_from_arrow_max_precision(): + with pytest.raises(ValueError): + DecimalColumn.from_arrow( + pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) + ) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 3482b314fee..08378361188 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -108,7 +108,7 @@ def test_dropna_with_all_nulls(how, data, axis): def test_dropna_nan_as_null(): sr = cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False) - assert_eq(sr.dropna(), sr[:3]) + assert_eq(sr.dropna(), sr[:2]) sr = sr.nans_to_nulls() assert_eq(sr.dropna(), sr[:2]) @@ -120,7 +120,7 @@ def test_dropna_nan_as_null(): ) got = df.dropna() - expected = df[:3] + expected = df[:2] assert_eq(expected, got) df = df.nans_to_nulls() @@ -210,13 +210,28 @@ def test_dropna_thresh_cols(thresh, subset, inplace): ) -def test_dropna_dataframe_np_nan(): - import numpy as np - - import cudf - - data = {"key": [1, 2], "val": [np.nan, 3]} +@pytest.mark.parametrize( + "data", + [ + { + "key": [1, 2, 10], + "val": cudf.Series([np.nan, 3, 1], nan_as_null=False), + "abc": [np.nan, None, 1], + }, + { + "key": [None, 2, 1], + "val": cudf.Series([3, np.nan, 0.1], nan_as_null=True), + "abc": [None, 1, None], + }, + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +def test_dropna_dataframe_np_nan(data, axis): gdf = cudf.DataFrame(data) - pdf = pd.DataFrame(data) + pd_data = { + key: value.to_pandas() if isinstance(value, cudf.Series) else value + for key, value in data.items() + } + pdf = pd.DataFrame(pd_data) - assert_eq(pdf.dropna(), gdf.dropna(), check_dtype=False) + 
assert_eq(pdf.dropna(axis=axis), gdf.dropna(axis=axis), check_dtype=False) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 4b5867c073f..32cecec3f60 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -8,9 +8,9 @@ import cudf from cudf.core.dtypes import ( CategoricalDtype, + Decimal64Dtype, ListDtype, StructDtype, - DecimalDtype, ) from cudf.tests.utils import assert_eq @@ -136,6 +136,12 @@ def test_struct_dtype_fields(fields): def test_decimal_dtype(): - dt = DecimalDtype(4, 2) + dt = Decimal64Dtype(4, 2) assert dt.to_arrow() == pa.decimal128(4, 2) - assert dt == DecimalDtype.from_arrow(pa.decimal128(4, 2)) + assert dt == Decimal64Dtype.from_arrow(pa.decimal128(4, 2)) + + +def test_max_precision(): + Decimal64Dtype(scale=0, precision=18) + with pytest.raises(ValueError): + Decimal64Dtype(scale=0, precision=19) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b42586f4137..294443500a9 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -65,9 +65,15 @@ def pdf(gdf): @pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) def test_groupby_mean(nelem): - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .mean() + ) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() + make_frame(pd.DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .mean() ) assert_eq(got_df, expect_df) @@ -78,12 +84,12 @@ def test_groupby_mean_3level(nelem): bys = list("xyz") got_df = ( make_frame(DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) + .groupby(bys, sort=True) .mean() ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) + .groupby(bys, sort=True) .mean() ) assert_eq(got_df, expect_df) @@ -93,12 +99,12 @@ def test_groupby_mean_3level(nelem): def test_groupby_agg_mean_min(nelem): got_df = ( make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg(["mean", "min"]) ) expect_df = ( make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg(["mean", "min"]) ) assert_eq(got_df, expect_df) @@ -108,12 +114,12 @@ def test_groupby_agg_mean_min(nelem): def test_groupby_agg_min_max_dictargs(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": "min", "b": "max"}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": "min", "b": "max"}) ) assert_eq(expect_df, got_df) @@ -123,12 +129,12 @@ def test_groupby_agg_min_max_dictargs(nelem): def test_groupby_agg_min_max_dictlist(nelem): expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) got_df = ( make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) + .groupby(["x", "y"], sort=True) .agg({"a": ["min", "max"], "b": ["min", "max"]}) ) assert_eq(got_df, expect_df) @@ -141,17 +147,23 @@ def test_groupby_agg_min_max_dictlist(nelem): def test_groupby_2keys_agg(nelem, func): # gdf 
(Note: lack of multiIndex) expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + make_frame(pd.DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) + ) + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) ) - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) check_dtype = False if func in _index_type_aggs else True assert_eq(got_df, expect_df, check_dtype=check_dtype) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_single_agg(pdf, gdf, as_index): - gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) + pdf = pdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) assert_eq(pdf, gdf) @@ -162,8 +174,12 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): ) gdf = cudf.from_pandas(pdf) - gdf = gdf.groupby(["a", "b"], as_index=as_index).agg({"c": "mean"}) - pdf = pdf.groupby(["a", "b"], as_index=as_index).agg({"c": "mean"}) + gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) + pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) if as_index: assert_eq(pdf, gdf) @@ -174,14 +190,14 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): def test_groupby_default(pdf, gdf): - gdf = gdf.groupby("y").agg({"x": "mean"}) - pdf = pdf.groupby("y").agg({"x": "mean"}) + gdf = gdf.groupby("y", sort=True).agg({"x": "mean"}) + pdf = pdf.groupby("y", sort=True).agg({"x": "mean"}) assert_eq(pdf, gdf) def test_group_keys_true(pdf, gdf): - gdf = gdf.groupby("y", group_keys=True).sum() - pdf = pdf.groupby("y", group_keys=True).sum() + gdf = gdf.groupby("y", group_keys=True, sort=True).sum() + pdf = pdf.groupby("y", group_keys=True, sort=True).sum() assert_eq(pdf, gdf) @@ -189,12 +205,21 @@ def test_group_keys_true(pdf, gdf): def test_groupby_getitem_getattr(as_index): pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) gdf = cudf.from_pandas(pdf) - assert_eq(pdf.groupby("x")["y"].sum(), gdf.groupby("x")["y"].sum()) - assert_eq(pdf.groupby("x").y.sum(), gdf.groupby("x").y.sum()) - assert_eq(pdf.groupby("x")[["y"]].sum(), gdf.groupby("x")[["y"]].sum()) assert_eq( - pdf.groupby(["x", "y"], as_index=as_index).sum(), - gdf.groupby(["x", "y"], as_index=as_index).sum(), + pdf.groupby("x", sort=True)["y"].sum(), + gdf.groupby("x", sort=True)["y"].sum(), + ) + assert_eq( + pdf.groupby("x", sort=True).y.sum(), + gdf.groupby("x", sort=True).y.sum(), + ) + assert_eq( + pdf.groupby("x", sort=True)[["y"]].sum(), + gdf.groupby("x", sort=True)[["y"]].sum(), + ) + assert_eq( + pdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), + gdf.groupby(["x", "y"], as_index=as_index, sort=True).sum(), ) @@ -244,8 +269,10 @@ def test_groupby_apply(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) - got_grpby = df.groupby(["key1", "key2"]) + expect_grpby = df.to_pandas().groupby( + ["key1", "key2"], as_index=False, sort=True + ) + got_grpby = df.groupby(["key1", "key2"], sort=True) def foo(df): df["out"] = df["val1"] + df["val2"] @@ -267,8 +294,10 @@ def test_groupby_apply_grouped(): df["val1"] = np.random.random(nelem) df["val2"] = np.random.random(nelem) - expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) - got_grpby = 
df.groupby(["key1", "key2"]) + expect_grpby = df.to_pandas().groupby( + ["key1", "key2"], as_index=False, sort=True + ) + got_grpby = df.groupby(["key1", "key2"], sort=True) def foo(key1, val1, com1, com2): for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x): @@ -302,11 +331,17 @@ def emulate(df): ["mean", "std", "var", "min", "max", "idxmin", "idxmax", "count", "sum"], ) def test_groupby_cudf_2keys_agg(nelem, func): - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + got_df = ( + make_frame(DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) + ) # pandas expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + make_frame(pd.DataFrame, nelem=nelem) + .groupby(["x", "y"], sort=True) + .agg(func) ) check_dtype = False if func in _index_type_aggs else True assert_eq(got_df, expect_df, check_dtype=check_dtype) @@ -399,8 +434,8 @@ def test_groupby_series_level_zero(agg): def test_groupby_column_name(): pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - g = gdf.groupby("yy") - p = pdf.groupby("yy") + g = gdf.groupby("yy", sort=True) + p = pdf.groupby("yy", sort=True) gxx = g["xx"].sum() pxx = p["xx"].sum() assert_eq(pxx, gxx) @@ -433,16 +468,16 @@ def test_groupby_column_name(): def test_groupby_column_numeral(): pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1) - g = gdf.groupby(1) + p = pdf.groupby(1, sort=True) + g = gdf.groupby(1, sort=True) pxx = p[0].sum() gxx = g[0].sum() assert_eq(pxx, gxx) pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}) gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1.5) - g = gdf.groupby(1.5) + p = pdf.groupby(1.5, sort=True) + g = gdf.groupby(1.5, sort=True) pxx = p[0.5].sum() gxx = g[0.5].sum() assert_eq(pxx, gxx) @@ -455,8 +490,8 @@ def test_groupby_column_numeral(): def test_groupby_external_series(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() + pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() + gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() assert_eq(pxx, gxx) @@ -464,8 +499,8 @@ def test_groupby_external_series(series): def test_groupby_external_series_incorrect_length(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() + pxx = pdf.groupby(pd.Series(series), sort=True).x.sum() + gxx = gdf.groupby(cudf.Series(series), sort=True).x.sum() assert_eq(pxx, gxx) @@ -476,49 +511,52 @@ def test_groupby_levels(level): idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) gdf = cudf.from_pandas(pdf) - assert_eq(pdf.groupby(level=level).sum(), gdf.groupby(level=level).sum()) + assert_eq( + pdf.groupby(level=level, sort=True).sum(), + gdf.groupby(level=level, sort=True).sum(), + ) def test_advanced_groupby_levels(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1], "z": [1, 1, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() assert_eq(pdg, gdg) - pdh = pdg.groupby(level=1).sum() - gdh = gdg.groupby(level=1).sum() + pdh = 
pdg.groupby(level=1, sort=True).sum() + gdh = gdg.groupby(level=1, sort=True).sum() assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y", "z"]).sum() - gdg = gdf.groupby(["x", "y", "z"]).sum() + pdg = pdf.groupby(["x", "y", "z"], sort=True).sum() + gdg = gdf.groupby(["x", "y", "z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["z"]).sum() - gdg = gdf.groupby(["z"]).sum() + pdg = pdf.groupby(["z"], sort=True).sum() + gdg = gdf.groupby(["z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["y", "z"]).sum() - gdg = gdf.groupby(["y", "z"]).sum() + pdg = pdf.groupby(["y", "z"], sort=True).sum() + gdg = gdf.groupby(["y", "z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["x", "z"]).sum() - gdg = gdf.groupby(["x", "z"]).sum() + pdg = pdf.groupby(["x", "z"], sort=True).sum() + gdg = gdf.groupby(["x", "z"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["y"]).sum() - gdg = gdf.groupby(["y"]).sum() + pdg = pdf.groupby(["y"], sort=True).sum() + gdg = gdf.groupby(["y"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["x"]).sum() - gdg = gdf.groupby(["x"]).sum() + pdg = pdf.groupby(["x"], sort=True).sum() + gdg = gdf.groupby(["x"], sort=True).sum() assert_eq(pdg, gdg) - pdh = pdg.groupby(level=0).sum() - gdh = gdg.groupby(level=0).sum() + pdh = pdg.groupby(level=0, sort=True).sum() + gdh = gdg.groupby(level=0, sort=True).sum() assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - pdh = pdg.groupby(level=[0, 1]).sum() - gdh = gdg.groupby(level=[0, 1]).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() + pdh = pdg.groupby(level=[0, 1], sort=True).sum() + gdh = gdg.groupby(level=[0, 1], sort=True).sum() assert_eq(pdh, gdh) - pdh = pdg.groupby(level=[1, 0]).sum() - gdh = gdg.groupby(level=[1, 0]).sum() + pdh = pdg.groupby(level=[1, 0], sort=True).sum() + gdh = gdg.groupby(level=[1, 0], sort=True).sum() assert_eq(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() assert_exceptions_equal( lfunc=pdg.groupby, @@ -569,19 +607,19 @@ def test_groupby_unsupported_columns(): ) pdf["b"] = pd_cat gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").sum() - gdg = gdf.groupby("x").sum() + pdg = pdf.groupby("x", sort=True).sum() + gdg = gdf.groupby("x", sort=True).sum() assert_eq(pdg, gdg) def test_list_of_series(): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1]}) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby([pdf.x]).y.sum() - gdg = gdf.groupby([gdf.x]).y.sum() + pdg = pdf.groupby([pdf.x], sort=True).y.sum() + gdg = gdf.groupby([gdf.x], sort=True).y.sum() assert_eq(pdg, gdg) - pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() - gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() + pdg = pdf.groupby([pdf.x, pdf.y], sort=True).y.sum() + gdg = gdf.groupby([gdf.x, gdf.y], sort=True).y.sum() pytest.skip() assert_eq(pdg, gdg) @@ -602,10 +640,10 @@ def test_groupby_list_then_string(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [6, 7, 6, 7, 6] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( + gdg = gdf.groupby("a", as_index=True, sort=True).agg( {"b": ["min", "max"], "c": "max"} ) - pdg = pdf.groupby("a", as_index=True).agg( + pdg = pdf.groupby("a", as_index=True, sort=True).agg( {"b": ["min", "max"], "c": "max"} ) assert_eq(gdg, pdg) @@ -617,10 +655,10 @@ def test_groupby_different_unequal_length_column_aggregations(): gdf["b"] = [11, 2, 15, 
12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( + gdg = gdf.groupby("a", as_index=True, sort=True).agg( {"b": "min", "c": ["max", "min"]} ) - pdg = pdf.groupby("a", as_index=True).agg( + pdg = pdf.groupby("a", as_index=True, sort=True).agg( {"b": "min", "c": ["max", "min"]} ) assert_eq(pdg, gdg) @@ -632,8 +670,8 @@ def test_groupby_single_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) - pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + gdg = gdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) + pdg = pdf.groupby("a", as_index=True, sort=True).agg({"b": ["min", "max"]}) assert_eq(pdg, gdg) @@ -643,8 +681,12 @@ def test_groupby_double_var_two_aggs(): gdf["b"] = [11, 2, 15, 12, 2] gdf["c"] = [11, 2, 15, 12, 2] pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) - pdg = pdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + gdg = gdf.groupby(["a", "b"], as_index=True, sort=True).agg( + {"c": ["min", "max"]} + ) + pdg = pdf.groupby(["a", "b"], as_index=True, sort=True).agg( + {"c": ["min", "max"]} + ) assert_eq(pdg, gdg) @@ -655,8 +697,8 @@ def test_groupby_apply_basic_agg_single_column(): gdf["mult"] = gdf["key"] * gdf["val"] pdf = gdf.to_pandas() - gdg = gdf.groupby(["key", "val"]).mult.sum() - pdg = pdf.groupby(["key", "val"]).mult.sum() + gdg = gdf.groupby(["key", "val"], sort=True).mult.sum() + pdg = pdf.groupby(["key", "val"], sort=True).mult.sum() assert_eq(pdg, gdg) @@ -668,8 +710,8 @@ def test_groupby_multi_agg_single_groupby_series(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").y.agg(["sum", "max"]) - gdg = gdf.groupby("x").y.agg(["sum", "max"]) + pdg = pdf.groupby("x", sort=True).y.agg(["sum", "max"]) + gdg = gdf.groupby("x", sort=True).y.agg(["sum", "max"]) assert_eq(pdg, gdg) @@ -684,8 +726,8 @@ def test_groupby_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) + pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) assert_eq(pdg, gdg) @@ -703,8 +745,8 @@ def test_groupby_datetime_multi_agg_multi_groupby(): } ) gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) + pdg = pdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"], sort=True).agg(["sum", "max"]) assert_eq(pdg, gdg) @@ -730,8 +772,8 @@ def test_groupby_multi_agg_hash_groupby(agg): ).reset_index(drop=True) pdf = gdf.to_pandas() check_dtype = False if "count" in agg else True - pdg = pdf.groupby("id").agg(agg) - gdg = gdf.groupby("id").agg(agg) + pdg = pdf.groupby("id", sort=True).agg(agg) + gdg = gdf.groupby("id", sort=True).agg(agg) assert_eq(pdg, gdg, check_dtype=check_dtype) @@ -744,8 +786,8 @@ def test_groupby_nulls_basic(agg): pdf = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": [1, 2, 1, 2, 1, None]}) gdf = cudf.from_pandas(pdf) assert_eq( - getattr(pdf.groupby("a"), agg)(), - getattr(gdf.groupby("a"), agg)(), + getattr(pdf.groupby("a", sort=True), agg)(), + getattr(gdf.groupby("a", sort=True), agg)(), check_dtype=check_dtype, ) @@ -758,8 +800,8 @@ def test_groupby_nulls_basic(agg): ) gdf = cudf.from_pandas(pdf) assert_eq( - getattr(pdf.groupby("a"), agg)(), 
- getattr(gdf.groupby("a"), agg)(), + getattr(pdf.groupby("a", sort=True), agg)(), + getattr(gdf.groupby("a", sort=True), agg)(), check_dtype=check_dtype, ) @@ -775,8 +817,8 @@ def test_groupby_nulls_basic(agg): # TODO: fillna() used here since we don't follow # Pandas' null semantics. Should we change it? assert_eq( - getattr(pdf.groupby("a"), agg)().fillna(0), - getattr(gdf.groupby("a"), agg)().fillna(0), + getattr(pdf.groupby("a", sort=True), agg)().fillna(0), + getattr(gdf.groupby("a", sort=True), agg)().fillna(0), check_dtype=check_dtype, ) @@ -805,13 +847,15 @@ def test_groupby_all_nulls_index(): assert_eq(pdf.groupby("a").sum(), gdf.groupby("a").sum()) -def test_groupby_sort(): +@pytest.mark.parametrize("sort", [True, False]) +def test_groupby_sort(sort): pdf = pd.DataFrame({"a": [2, 2, 1, 1], "b": [1, 2, 3, 4]}) gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a", sort=False).sum().sort_index(), - gdf.groupby("a", sort=False).sum().sort_index(), + pdf.groupby("a", sort=sort).sum(), + gdf.groupby("a", sort=sort).sum(), + check_like=not sort, ) pdf = pd.DataFrame( @@ -820,8 +864,30 @@ def test_groupby_sort(): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby(["c", "b"], sort=False).sum().sort_index(), - gdf.groupby(["c", "b"], sort=False).sum().to_pandas().sort_index(), + pdf.groupby(["c", "b"], sort=sort).sum(), + gdf.groupby(["c", "b"], sort=sort).sum(), + check_like=not sort, + ) + + ps = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=[2, 2, 2, 3, 3, 1, 1, 1]) + gs = cudf.from_pandas(ps) + + assert_eq( + ps.groupby(level=0, sort=sort).sum().to_frame(), + gs.groupby(level=0, sort=sort).sum().to_frame(), + check_like=not sort, + ) + + ps = pd.Series( + [1, 2, 3, 4, 5, 6, 7, 8], + index=pd.MultiIndex.from_product([(1, 2), ("a", "b"), (42, 84)]), + ) + gs = cudf.from_pandas(ps) + + assert_eq( + ps.groupby(level=0, sort=sort).sum().to_frame(), + gs.groupby(level=0, sort=sort).sum().to_frame(), + check_like=not sort, ) @@ -831,7 +897,9 @@ def test_groupby_cat(): ) gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").count(), gdf.groupby("a").count(), check_dtype=False + pdf.groupby("a", sort=True).count(), + gdf.groupby("a", sort=True).count(), + check_dtype=False, ) @@ -883,8 +951,8 @@ def test_groupby_std(): } pdf = pd.DataFrame(raw_data) gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby("x") - gdg = gdf.groupby("x") + pdg = pdf.groupby("x", sort=True) + gdg = gdf.groupby("x", sort=True) pdresult = pdg.std() gdresult = gdg.std() @@ -906,18 +974,22 @@ def test_groupby_size(): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").size(), gdf.groupby("a").size(), check_dtype=False + pdf.groupby("a", sort=True).size(), + gdf.groupby("a", sort=True).size(), + check_dtype=False, ) assert_eq( - pdf.groupby(["a", "b", "c"]).size(), - gdf.groupby(["a", "b", "c"]).size(), + pdf.groupby(["a", "b", "c"], sort=True).size(), + gdf.groupby(["a", "b", "c"], sort=True).size(), check_dtype=False, ) sr = pd.Series(range(len(pdf))) assert_eq( - pdf.groupby(sr).size(), gdf.groupby(sr).size(), check_dtype=False + pdf.groupby(sr, sort=True).size(), + gdf.groupby(sr, sort=True).size(), + check_dtype=False, ) @@ -932,8 +1004,8 @@ def test_groupby_datetime(nelem, as_index, agg): check_dtype = agg not in ("mean", "count", "idxmin", "idxmax") pdf = make_frame(pd.DataFrame, nelem=nelem, with_datetime=True) gdf = make_frame(cudf.DataFrame, nelem=nelem, with_datetime=True) - pdg = pdf.groupby("datetime", as_index=as_index) - gdg = gdf.groupby("datetime", as_index=as_index) + pdg = 
pdf.groupby("datetime", as_index=as_index, sort=True) + gdg = gdf.groupby("datetime", as_index=as_index, sort=True) if as_index is False: pdres = getattr(pdg, agg)() gdres = getattr(gdg, agg)() @@ -948,7 +1020,7 @@ def test_groupby_dropna(): expect = cudf.DataFrame( {"b": [3, 3]}, index=cudf.Series([1, None], name="a") ) - got = df.groupby("a", dropna=False).sum() + got = df.groupby("a", dropna=False, sort=True).sum() assert_eq(expect, got) df = cudf.DataFrame( @@ -959,7 +1031,7 @@ def test_groupby_dropna(): names=["a", "b"], ) expect = cudf.DataFrame({"c": [4, 2, 4]}, index=idx) - got = df.groupby(["a", "b"], dropna=False).sum() + got = df.groupby(["a", "b"], dropna=False, sort=True).sum() assert_eq(expect, got) @@ -968,7 +1040,7 @@ def test_groupby_dropna_getattr(): df = cudf.DataFrame() df["id"] = [0, 1, 1, None, None, 3, 3] df["val"] = [0, 1, 1, 2, 2, 3, 3] - got = df.groupby("id", dropna=False).val.sum() + got = df.groupby("id", dropna=False, sort=True).val.sum() expect = cudf.Series( [0, 2, 6, 4], name="val", index=cudf.Series([0, 1, 3, None], name="id") @@ -984,7 +1056,7 @@ def test_groupby_categorical_from_string(): gdf["id"] = gdf["id"].astype("category") assert_eq( cudf.DataFrame({"val": gdf["val"]}).set_index(keys=gdf["id"]), - gdf.groupby("id").sum(), + gdf.groupby("id", sort=True).sum(), ) @@ -1069,8 +1141,8 @@ def test_groupby_count(agg, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by).agg(agg) - got = gdf.groupby(by).agg(agg) + expect = pdf.groupby(by, sort=True).agg(agg) + got = gdf.groupby(by, sort=True).agg(agg) assert_eq(expect, got, check_dtype=False) @@ -1120,8 +1192,8 @@ def test_groupby_nth(n, by): ) gdf = cudf.from_pandas(pdf) - expect = pdf.groupby(by).nth(n) - got = gdf.groupby(by).nth(n) + expect = pdf.groupby(by, sort=True).nth(n) + got = gdf.groupby(by, sort=True).nth(n) assert_eq(expect, got, check_dtype=False) @@ -1168,7 +1240,9 @@ def test_groupby_agg_combinations(agg): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").agg(agg), gdf.groupby("a").agg(agg), check_dtype=False + pdf.groupby("a", sort=True).agg(agg), + gdf.groupby("a", sort=True).agg(agg), + check_dtype=False, ) @@ -1193,8 +1267,8 @@ def test_reset_index_after_empty_groupby(): gdf = cudf.from_pandas(pdf) assert_eq( - pdf.groupby("a").sum().reset_index(), - gdf.groupby("a").sum().reset_index(), + pdf.groupby("a", sort=True).sum().reset_index(), + gdf.groupby("a", sort=True).sum().reset_index(), ) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b0f1cfed2c0..b8e157b12ae 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -460,19 +460,19 @@ def test_multiindex_multiple_groupby(): } ) gdf = cudf.DataFrame.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).sum() - gdg = gdf.groupby(["a", "b"]).sum() + pdg = pdf.groupby(["a", "b"], sort=True).sum() + gdg = gdf.groupby(["a", "b"], sort=True).sum() assert_eq(pdg, gdg) - pdg = pdf.groupby(["a", "b"]).x.sum() - gdg = gdf.groupby(["a", "b"]).x.sum() + pdg = pdf.groupby(["a", "b"], sort=True).x.sum() + gdg = gdf.groupby(["a", "b"], sort=True).x.sum() assert_eq(pdg, gdg) @pytest.mark.parametrize( "func", [ - lambda df: df.groupby(["x", "y"]).z.sum(), - lambda df: df.groupby(["x", "y"]).sum(), + lambda df: df.groupby(["x", "y"], sort=True).z.sum(), + lambda df: df.groupby(["x", "y"], sort=True).sum(), ], ) def test_multi_column(func): @@ -498,7 +498,7 @@ def test_multiindex_equality(): gdf = cudf.DataFrame( {"x": [1, 5, 3, 4, 1], 
"y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) - mi1 = gdf.groupby(["x", "y"]).mean().index + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index mi2 = cudf.MultiIndex( levels=[[1, 3, 4, 5], [1, 2, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], @@ -507,7 +507,7 @@ def test_multiindex_equality(): assert_eq(mi1, mi2) # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"]).max().index + mi2 = gdf.groupby(["x", "y"], sort=True).max().index assert_eq(mi1, mi2) # mi made manually twice are they equal? @@ -549,7 +549,7 @@ def test_multiindex_equals(): gdf = cudf.DataFrame( {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) - mi1 = gdf.groupby(["x", "y"]).mean().index + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index mi2 = cudf.MultiIndex( levels=[[1, 3, 4, 5], [1, 2, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], @@ -558,7 +558,7 @@ def test_multiindex_equals(): assert_eq(mi1.equals(mi2), True) # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"]).max().index + mi2 = gdf.groupby(["x", "y"], sort=True).max().index assert_eq(mi1.equals(mi2), True) # mi made manually twice are they equal? @@ -575,8 +575,8 @@ def test_multiindex_equals(): assert_eq(mi1.equals(mi2), True) # mi made from different groupbys are they not equal? - mi1 = gdf.groupby(["x", "y"]).mean().index - mi2 = gdf.groupby(["x", "z"]).mean().index + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index + mi2 = gdf.groupby(["x", "z"], sort=True).mean().index assert_eq(mi1.equals(mi2), False) # mi made from different manuals are they not equal? @@ -647,8 +647,8 @@ def test_multiindex_copy_sem(data, levels, codes, names): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - gdf = gdf.groupby(["Date", "Symbol"]).mean() - pdf = pdf.groupby(["Date", "Symbol"]).mean() + gdf = gdf.groupby(["Date", "Symbol"], sort=True).mean() + pdf = pdf.groupby(["Date", "Symbol"], sort=True).mean() gmi = gdf.index gmi_copy = gmi.copy(levels=levels, codes=codes, names=names) @@ -882,8 +882,8 @@ def test_multiindex_groupby_to_frame(): {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"]).count() - pdg = pdf.groupby(["x", "y"]).count() + gdg = gdf.groupby(["x", "y"], sort=True).count() + pdg = pdf.groupby(["x", "y"], sort=True).count() assert_eq(pdg.index.to_frame(), gdg.index.to_frame()) @@ -899,22 +899,22 @@ def test_multiindex_groupby_reset_index(): {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"]).sum() - pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"], sort=True).sum() + pdg = pdf.groupby(["x", "y"], sort=True).sum() assert_eq(pdg.reset_index(), gdg.reset_index()) def test_multicolumn_reset_index(): gdf = cudf.DataFrame({"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5]}) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x"]).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x"]).agg({"y": ["count", "mean"]}) + gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) + pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"]).agg({"y": ["count"]}) - pdg = pdf.groupby(["x"]).agg({"y": ["count"]}) + gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count"]}) + pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"]).agg({"y": "count"}) - pdg = 
pdf.groupby(["x"]).agg({"y": "count"}) + gdg = gdf.groupby(["x"], sort=True).agg({"y": "count"}) + pdg = pdf.groupby(["x"], sort=True).agg({"y": "count"}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) @@ -923,11 +923,11 @@ def test_multiindex_multicolumn_reset_index(): {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [1, 2, 3, 4, 5]} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"]).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "y"]).agg({"y": ["count", "mean"]}) + gdg = gdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) + pdg = pdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x", "z"]).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "z"]).agg({"y": ["count", "mean"]}) + gdg = gdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) + pdg = pdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index d590a3ddb52..85e61acd8e6 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,8 +1,7 @@ # Copyright (c) 2019-2021, NVIDIA CORPORATION. -import os import datetime -import math +import os from io import BytesIO import numpy as np @@ -12,9 +11,8 @@ import pytest import cudf -from cudf.tests.utils import assert_eq, supported_numpy_dtypes, gen_rand_series - from cudf.io.orc import ORCWriter +from cudf.tests.utils import assert_eq, gen_rand_series, supported_numpy_dtypes @pytest.fixture(scope="module") @@ -565,7 +563,7 @@ def normalized_equals(value1, value2): # Compare integers with floats now if isinstance(value1, float) or isinstance(value2, float): - return math.isclose(value1, value2) + return np.isclose(value1, value2) return value1 == value2 diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 55cff5ae6dd..8c09dc91253 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -244,8 +244,8 @@ def test_groupby_MI(nrows, ncols): {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} ) pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"]).count() - pdg = pdf.groupby(["a", "b"]).count() + gdg = gdf.groupby(["a", "b"], sort=True).count() + pdg = pdf.groupby(["a", "b"], sort=True).count() pd.options.display.max_rows = nrows pd.options.display.max_columns = ncols assert gdg.__repr__() == pdg.__repr__() diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index f8e7fc5b4f3..1ae5bab0da4 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -321,6 +321,18 @@ def test_rolling_groupby_simple(agg): got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) + pdf = pd.DataFrame( + {"a": [1, 1, 1, 2, 2], "b": [1, 1, 2, 2, 3], "c": [1, 2, 3, 4, 5]} + ) + gdf = cudf.from_pandas(pdf) + + for window_size in range(1, len(pdf) + 1): + expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( + -1 + ) + got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) + assert_eq(expect, got, check_dtype=False) + @pytest.mark.parametrize("agg", ["sum", "min", "max", "mean", "count"]) def test_rolling_groupby_multi(agg): @@ -335,10 +347,10 @@ def test_rolling_groupby_multi(agg): for window_size in range(1, 
len(pdf) + 1): expect = getattr( - pdf.groupby(["a", "b"]).rolling(window_size), agg + pdf.groupby(["a", "b"], sort=True).rolling(window_size), agg )().fillna(-1) got = getattr( - gdf.groupby(["a", "b"]).rolling(window_size), agg + gdf.groupby(["a", "b"], sort=True).rolling(window_size), agg )().fillna(-1) assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index f4d04f84097..656b66bf793 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -146,11 +146,11 @@ def test_serialize_groupby_df(): df["key_1"] = np.random.randint(0, 20, 100) df["key_2"] = np.random.randint(0, 20, 100) df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(["key_1", "key_2"]) + gb = df.groupby(["key_1", "key_2"], sort=True) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_groupby_external(): @@ -160,7 +160,7 @@ def test_serialize_groupby_external(): outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_groupby_level(): @@ -171,7 +171,7 @@ def test_serialize_groupby_level(): expect = gb.mean() outgb = gb.deserialize(*gb.serialize()) got = outgb.mean() - assert_eq(expect, got) + assert_eq(expect.sort_index(), got.sort_index()) def test_serialize_groupby_sr(): @@ -180,7 +180,7 @@ def test_serialize_groupby_sr(): outgb = gb.deserialize(*gb.serialize()) got = gb.mean() expect = outgb.mean() - assert_eq(got, expect) + assert_eq(got.sort_index(), expect.sort_index()) def test_serialize_datetime(): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 980dcb5a13b..a19b88caf4c 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -913,3 +913,24 @@ def custom_add_func(sr, val): lfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), rfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), ) + + +@pytest.mark.parametrize( + "data", + [ + [1, None, 11, 2.0, np.nan], + [np.nan], + [None, None, None], + [np.nan, 1, 10, 393.32, np.nan], + ], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +@pytest.mark.parametrize("fill_value", [1.2, 332, np.nan]) +def test_fillna_with_nan(data, nan_as_null, fill_value): + gs = cudf.Series(data, nan_as_null=nan_as_null) + ps = gs.to_pandas() + + expected = ps.fillna(fill_value) + actual = gs.fillna(fill_value) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 974892cb8e7..080420c8f75 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1117,8 +1117,8 @@ def test_string_groupby_key_index(): pdf["b"] = other_data gdf["b"] = other_data - expect = pdf.groupby("a").count() - got = gdf.groupby("a").count() + expect = pdf.groupby("a", sort=True).count() + got = gdf.groupby("a", sort=True).count() assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index cc580bedc08..1e8beb18234 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,5 +1,7 @@ # Copyright (c) 2018, NVIDIA CORPORATION. 
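# A minimal illustrative sketch (not from the patch; `_example_cache` is a
# hypothetical stand-in for the module-level `_cache` annotated in the hunk
# below): mypy 0.782 reads a PEP 484 `# type:` comment as if it were a
# variable annotation, so the object stays a plain dict at runtime while the
# checker sees Dict[Any, Any].
from typing import Any, Dict

_example_cache = dict()  # type: Dict[Any, Any]
# Under PEP 526 syntax (Python 3.6+) the equivalent spelling would be:
#     _example_cache: Dict[Any, Any] = {}
# The comment form checks identically without changing any runtime code.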
+ import functools +from typing import Any, Dict from numba import cuda @@ -332,7 +334,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): return kernel -_cache = dict() # WeakKeyDictionary() +_cache = dict() # type: Dict[Any, Any] @functools.wraps(_make_row_wise_kernel) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 3c15e1b2ad5..d49b4abd399 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. import datetime as dt import numbers @@ -234,13 +234,23 @@ def is_struct_dtype(obj): return ( type(obj) is cudf.core.dtypes.StructDtype or obj is cudf.core.dtypes.StructDtype - # or type(obj) is cudf.core.column.StructColumn - # or obj is cudf.core.column.StructColumn or (isinstance(obj, str) and obj == cudf.core.dtypes.StructDtype.name) or (hasattr(obj, "dtype") and is_struct_dtype(obj.dtype)) ) +def is_decimal_dtype(obj): + return ( + type(obj) is cudf.core.dtypes.Decimal64Dtype + or obj is cudf.core.dtypes.Decimal64Dtype + or ( + isinstance(obj, str) + and obj == cudf.core.dtypes.Decimal64Dtype.name + ) + or (hasattr(obj, "dtype") and is_decimal_dtype(obj.dtype)) + ) + + def cudf_dtype_from_pydata_dtype(dtype): """ Given a numpy or pandas dtype, converts it into the equivalent cuDF Python dtype. diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 82a51b3f9b4..c71a6dbccb1 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -2,6 +2,7 @@ import ast import datetime as dt +from typing import Any, Dict import numpy as np import six @@ -101,7 +102,7 @@ def _check_error(tree): raise QuerySyntaxError("too many expressions") -_cache = {} +_cache = {} # type: Dict[Any, Any] def query_compile(expr): diff --git a/python/cudf/setup.cfg b/python/cudf/setup.cfg index 0b2711155d7..3067d2daafd 100644 --- a/python/cudf/setup.cfg +++ b/python/cudf/setup.cfg @@ -46,6 +46,21 @@ skip= dist __init__.py +[mypy] +ignore_missing_imports = True + +[mypy-cudf._lib.*] +ignore_errors = True + +[mypy-cudf._version] +ignore_errors = True + +[mypy-cudf.utils.metadata.orc_column_statistics_pb2] +ignore_errors = True + +[mypy-cudf.tests.*] +ignore_errors = True + [tool:pytest] addopts = --benchmark-warmup=off @@ -60,4 +75,3 @@ python_files = python_functions = bench_* test_* - diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index f0cf5bddca0..0ba35460835 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -424,6 +424,11 @@ def var( result.divisions = (min(self.columns), max(self.columns)) return handle_out(out, result) + def groupby(self, *args, **kwargs): + from .groupby import CudfSeriesGroupBy + + return CudfSeriesGroupBy(self, *args, **kwargs) + class Index(Series, dd.core.Index): _partition_type = cudf.Index diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 494da4927d2..2803212a502 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -13,7 +13,7 @@ new_dd_object, split_out_on_cols, ) -from dask.dataframe.groupby import DataFrameGroupBy +from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy from dask.highlevelgraph import HighLevelGraph @@ -23,6 +23,40 @@ def __init__(self, *args, **kwargs): self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) + def 
__getitem__(self, key): + if isinstance(key, list): + g = CudfDataFrameGroupBy( + self.obj, + by=self.index, + slice=key, + sort=self.sort, + **self.dropna, + ) + else: + g = CudfSeriesGroupBy( + self.obj, + by=self.index, + slice=key, + sort=self.sort, + **self.dropna, + ) + + g._meta = g._meta[key] + return g + + def mean(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {c: "mean" for c in self.obj.columns if c not in self.index}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + ) + def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -50,6 +84,52 @@ def aggregate(self, arg, split_every=None, split_out=1): ) +class CudfSeriesGroupBy(SeriesGroupBy): + def __init__(self, *args, **kwargs): + self.sep = kwargs.pop("sep", "___") + self.as_index = kwargs.pop("as_index", True) + super().__init__(*args, **kwargs) + + def mean(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {self._slice: "mean"}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + )[self._slice] + + def aggregate(self, arg, split_every=None, split_out=1): + if arg == "size": + return self.size() + + _supported = {"count", "mean", "std", "var", "sum", "min", "max"} + if ( + isinstance(self.obj, DaskDataFrame) + and isinstance(self.index, (str, list)) + and _is_supported({self._slice: arg}, _supported) + ): + return groupby_agg( + self.obj, + self.index, + {self._slice: arg}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + ) + + return super().aggregate( + arg, split_every=split_every, split_out=split_out + ) + + def groupby_agg( ddf, gb_cols,