diff --git a/CHANGELOG.md b/CHANGELOG.md index 2064f0ea04d..a7332b5c2f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,29 +2,38 @@ ## New Features - PR #6856 Add groupby idxmin, idxmax aggregation - - PR #6847 Add a cmake find module for cuFile in JNI code - PR #6902 Implement `DataFrame.quantile` for `datetime` and `timedelta` data types - PR #6814 Implement `cudf::reduce` for `decimal32` and `decimal64` (part 1) - PR #6929 Add `Index.set_names` api - PR #6907 Add `replace_null` API with `replace_policy` parameter, `fixed_width` column support +- PR #6885 Share `factorize` implementation with Index and cudf module + +- PR #6775 Implement cudf.DateOffset for months ## Improvements +- PR #6938 Pass numeric scalars of the same dtype through numeric binops - PR #6275 Update to official libcu++ on Github - PR #6838 Fix `columns` & `index` handling in dataframe constructor - PR #6750 Remove **kwargs from string/categorical methods +- PR #6909 Support reading byte array backed decimal columns from parquet files - PR #6939 Use simplified `rmm::exec_policy` +- PR #6512 Refactor rolling.cu to reduce compile time +- PR #6982 Disable some pragma unroll statements in thrust `sort.h` ## Bug Fixes +- PR #6884 Correct the sampling range when sampling with replacement +- PR #6903 Add null count test for apply_boolean_mask - PR #6922 Fix N/A detection for empty fields in CSV reader - PR #6912 Fix rmm_mode=managed parameter for gtests +- PR #6943 Fix join with nulls not equal performance - PR #6945 Fix groupby agg/apply behaviour when no key columns are provided - PR #6942 Fix cudf::merge gtest for dictionary columns -# cuDF 0.17.0 (Date TBD) +# cuDF 0.17.0 (10 Dec 2020) ## New Features @@ -63,6 +72,7 @@ - PR #6765 Cupy fallback for __array_function__ and __array_ufunc__ for cudf.Series - PR #6817 Add support for scatter() on lists-of-struct columns - PR #6805 Implement `cudf::detail::copy_if` for `decimal32` and `decimal64` +- PR #6483 Add `agg` function to aggregate dataframe 
using one or more operations - PR #6726 Support selecting different hash functions in hash_partition - PR #6619 Improve Dockerfile - PR #6831 Added parquet chunked writing ability for list columns @@ -153,6 +163,7 @@ - PR #6837 Avoid gather when copying strings view from start of strings column - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp - PR #6807 Refactor `std::array` usage in row group index writing in ORC +- PR #6914 Enable groupby `list` aggregation for strings - PR #6908 Parquet option for strictly decimal reading ## Bug Fixes diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 6c516d50a70..6991a5bac01 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -205,15 +205,15 @@ fi cd $WORKSPACE/python/cudf gpuci_logger "Python py.test for cuDF" -py.test --cache-clear --basetemp=${WORKSPACE}/cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-cudf.xml -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:${WORKSPACE}/python/cudf/cudf-coverage.xml --cov-report term +py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-cudf.xml -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:${WORKSPACE}/python/cudf/cudf-coverage.xml --cov-report term cd $WORKSPACE/python/dask_cudf gpuci_logger "Python py.test for dask-cudf" -py.test --cache-clear --basetemp=${WORKSPACE}/dask-cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-dask-cudf.xml -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:${WORKSPACE}/python/dask_cudf/dask-cudf-coverage.xml --cov-report term +py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/dask-cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-dask-cudf.xml -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:${WORKSPACE}/python/dask_cudf/dask-cudf-coverage.xml --cov-report term cd $WORKSPACE/python/custreamz gpuci_logger "Python py.test for cuStreamz" -py.test --cache-clear --basetemp=${WORKSPACE}/custreamz-cuda-tmp --junitxml=${WORKSPACE}/junit-custreamz.xml -v --cov-config=.coveragerc 
--cov=custreamz --cov-report=xml:${WORKSPACE}/python/custreamz/custreamz-coverage.xml --cov-report term +py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/custreamz-cuda-tmp --junitxml=${WORKSPACE}/junit-custreamz.xml -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:${WORKSPACE}/python/custreamz/custreamz-coverage.xml --cov-report term gpuci_logger "Test notebooks" ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 5acc953c03e..3de28fe30b0 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -23,6 +23,7 @@ dependencies: - fsspec>=0.6.0 - pytest - pytest-benchmark + - pytest-xdist - sphinx - sphinx_rtd_theme - sphinxcontrib-websupport diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 6ae0c6a8703..7055228943a 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -23,6 +23,7 @@ dependencies: - fsspec>=0.6.0 - pytest - pytest-benchmark + - pytest-xdist - sphinx - sphinx_rtd_theme - sphinxcontrib-websupport diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index c313352d731..497d8feefea 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -23,6 +23,7 @@ dependencies: - fsspec>=0.6.0 - pytest - pytest-benchmark + - pytest-xdist - sphinx - sphinx_rtd_theme - sphinxcontrib-websupport diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index f22929b7649..5348ec471e9 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -26,7 +26,7 @@ requirements: - cmake >=3.17.0 host: - libcudf {{ version }} - - librdkafka 1.5 + - librdkafka >=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should 
be automatically included here by run_exports but is not diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 615960bece6..3f435a4368d 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -164,7 +164,8 @@ ConfigureBench(SEARCH_BENCH "${SEARCH_BENCH_SRC}") # - sort benchmark -------------------------------------------------------------------------------- set(SORT_BENCH_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cu") + "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_benchmark.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/sort/sort_strings_benchmark.cu") ConfigureBench(SORT_BENCH "${SORT_BENCH_SRC}") diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index a064270d5a5..2419114ab20 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -307,7 +307,7 @@ std::unique_ptr create_random_column(data_profile const& profile, */ struct string_column_data { std::vector chars; - std::vector offsets; + std::vector offsets; std::vector null_mask; explicit string_column_data(cudf::size_type rows, cudf::size_type size) { diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu index b18ceafdae6..bd013afc451 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ b/cpp/benchmarks/join/join_benchmark.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -36,7 +37,7 @@ template class Join : public cudf::benchmark { }; -template +template static void BM_join(benchmark::State &state) { const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; @@ -46,11 +47,33 @@ static void BM_join(benchmark::State &state) const bool is_build_table_key_unique = true; // Generate build and probe tables - - auto build_key_column = - cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), build_table_size); - auto probe_key_column = - 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), probe_table_size); + cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); + auto build_random_null_mask = [&rand_gen](int size) { + if (Nullable) { + // roughly 25% nulls + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); + return cudf::test::detail::make_null_mask(validity, validity + size); + } else { + return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); + } + }; + + std::unique_ptr build_key_column = [&]() { + return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size, + build_random_null_mask(build_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size); + }(); + std::unique_ptr probe_key_column = [&]() { + return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size, + build_random_null_mask(probe_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size); + }(); generate_input_tables( build_key_column->mutable_view().data(), @@ -82,17 +105,23 @@ static void BM_join(benchmark::State &state) for (auto _ : state) { cuda_event_timer raii(state, true, 0); - auto result = - cudf::inner_join(probe_table, build_table, columns_to_join, columns_to_join, {{0, 0}}); + auto result = cudf::inner_join(probe_table, + build_table, + columns_to_join, + columns_to_join, + {{0, 0}}, + cudf::null_equality::UNEQUAL); } } -#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) { BM_join(st); } +#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) { BM_join(st); } -JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, 
int32_t); -JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t); +JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t, false); +JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t, false); +JOIN_BENCHMARK_DEFINE(join_32bit_nulls, int32_t, int32_t, true); +JOIN_BENCHMARK_DEFINE(join_64bit_nulls, int64_t, int64_t, true); BENCHMARK_REGISTER_F(Join, join_32bit) ->Unit(benchmark::kMillisecond) @@ -111,3 +140,21 @@ BENCHMARK_REGISTER_F(Join, join_64bit) ->Args({50'000'000, 50'000'000}) ->Args({40'000'000, 120'000'000}) ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); diff --git a/cpp/benchmarks/sort/sort_strings_benchmark.cu b/cpp/benchmarks/sort/sort_strings_benchmark.cu new file mode 100644 index 00000000000..0566ac2ed75 --- /dev/null +++ b/cpp/benchmarks/sort/sort_strings_benchmark.cu @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +class SortStrings : public cudf::benchmark { +}; + +static void BM_sort(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + + auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::sort(table->view()); + } +} + +#define SORT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(SortStrings, name) \ + (::benchmark::State & st) { BM_sort(st); } \ + BENCHMARK_REGISTER_F(SortStrings, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 10, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +SORT_BENCHMARK_DEFINE(stringssort) diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index 5e84e9fb9ae..d972247c86d 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -17,7 +17,7 @@ /** * @file synchronization.hpp * @brief This is the header file for `cuda_event_timer`. - **/ + */ /** * @brief This class serves as a wrapper for using `cudaEvent_t` as the user @@ -54,7 +54,7 @@ BENCHMARK(sample_cuda_benchmark)->UseManualTime(); - **/ + */ #ifndef CUDF_BENCH_SYNCHRONIZATION_H #define CUDF_BENCH_SYNCHRONIZATION_H @@ -79,7 +79,7 @@ class cuda_event_timer { * @param[in] flush_l2_cache_ whether or not to flush the L2 cache before * every iteration. * @param[in] stream_ The CUDA stream we are measuring time on. 
- **/ + */ cuda_event_timer(benchmark::State& state, bool flush_l2_cache, rmm::cuda_stream_view stream = rmm::cuda_stream_default); diff --git a/cpp/docs/TRANSITIONGUIDE.md b/cpp/docs/TRANSITIONGUIDE.md index e1ffd2b6525..8a8a8dc26c4 100644 --- a/cpp/docs/TRANSITIONGUIDE.md +++ b/cpp/docs/TRANSITIONGUIDE.md @@ -777,7 +777,7 @@ namespace experimental{ * @param mr Memory resource used to allocate device memory for the returned * output column * @return std::unique_ptr Newly allocated output column - **/ + */ std::unique_ptr new_function(cudf::column_view input, cudf::mutable_column_view in_out, cudf::table_view input_table, diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp index c9f61490c5d..44910bcaad1 100644 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ b/cpp/include/cudf/ast/detail/linearizer.hpp @@ -39,7 +39,6 @@ namespace detail { * * This enum is device-specific. For instance, intermediate data references are generated by the * linearization process but cannot be explicitly created by the user. - * */ enum class device_data_reference_type { COLUMN, // A value in a table column @@ -52,7 +51,6 @@ enum class device_data_reference_type { * * This is a POD class used to create references describing data type and locations for consumption * by the `row_evaluator`. - * */ struct alignas(8) device_data_reference { device_data_reference(device_data_reference_type reference_type, @@ -85,7 +83,6 @@ class linearizer; * * This class is a part of a "visitor" pattern with the `linearizer` class. * Nodes inheriting from this class can accept visitors. - * */ class node { friend class detail::linearizer; @@ -104,7 +101,6 @@ class node { * the nodes and constructing vectors of information that are later used by the device for * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are * resolved into intermediate data storage in shared memory. 
- * */ class linearizer { friend class literal; diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 536dbb94a52..8ec26cf5eb7 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -931,7 +931,6 @@ struct dispatch_unary_operator_types { /** * @brief Functor performing a type dispatch for a unary operator. - * */ struct type_dispatch_unary_op { template @@ -968,7 +967,6 @@ CUDA_HOST_DEVICE_CALLABLE constexpr void unary_operator_dispatcher(ast_operator /** * @brief Functor to determine the return type of an operator from its input types. - * */ struct return_type_functor { /** @@ -1057,7 +1055,6 @@ inline cudf::data_type ast_operator_return_type(ast_operator op, /** * @brief Functor to determine the arity (number of operands) of an operator. - * */ struct arity_functor { template diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 61aedab2f04..ee08742d871 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -126,7 +126,6 @@ struct binary_row_output : public row_output { * This class is designed for n-ary transform evaluation. Currently this class assumes that there's * only one relevant "row index" in its methods, which corresponds to a row in a single input table * and the same row index in an output column. - * */ struct row_evaluator { friend struct row_output; diff --git a/cpp/include/cudf/ast/linearizer.hpp b/cpp/include/cudf/ast/linearizer.hpp index 541e16c992a..594dd0a73ce 100644 --- a/cpp/include/cudf/ast/linearizer.hpp +++ b/cpp/include/cudf/ast/linearizer.hpp @@ -32,7 +32,6 @@ namespace ast { * @brief Enum of table references. * * This determines which table to use in cases with two tables (e.g. joins). 
- * */ enum class table_reference { LEFT, // Column index in the left table @@ -47,7 +46,6 @@ class expression; /** * @brief A literal value used in an abstract syntax tree. - * */ class literal : public detail::node { friend class detail::linearizer; @@ -114,7 +112,6 @@ class literal : public detail::node { /** * @brief A node referring to data from a column in a table. - * */ class column_reference : public detail::node { friend class detail::linearizer; @@ -194,7 +191,6 @@ class column_reference : public detail::node { /** * @brief An expression node holds an operator and zero or more operands. - * */ class expression : public detail::node { friend class detail::linearizer; diff --git a/cpp/include/cudf/ast/operators.hpp b/cpp/include/cudf/ast/operators.hpp index 75c2eac9d8a..78e56340246 100644 --- a/cpp/include/cudf/ast/operators.hpp +++ b/cpp/include/cudf/ast/operators.hpp @@ -21,7 +21,6 @@ namespace ast { /** * @brief Enum of supported operators. - * */ enum class ast_operator { // Binary operators diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index 7966b6a1472..a08b10df6f4 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -53,7 +53,7 @@ class column { * @brief Construct a new column by deep copying the contents of `other`. * * @param other The column to copy - **/ + */ column(column const& other); /** @@ -77,7 +77,7 @@ class column { * After the move, `other.size() == 0` and `other.type() = {EMPTY}` * * @param other The column whose contents will be moved into the new column - **/ + */ column(column&& other) noexcept; /** @@ -95,7 +95,7 @@ class column { * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on * the first invocation of `null_count()`. 
* @param children Optional, vector of child columns - **/ + */ template column(data_type dtype, size_type size, diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 0f9bcfd5cd9..1672f0d69aa 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -50,7 +50,6 @@ namespace detail { * not-obvious computation of null count, which could lead to undesirable performance issues. * This information is also generally not needed in device code, and on the host-side * is easily accessible from the associated column_view. - * */ class alignas(16) column_device_view_base { public: @@ -795,6 +794,29 @@ __device__ inline numeric::decimal64 const column_device_view::element + word_index(source_begin_bit + + destination_word_index * detail::size_in_bits())) { + next_word = source[source_word_index + 1]; + } + return __funnelshift_r(curr_word, next_word, source_begin_bit); +} + /** * @brief value accessor of column without null bitmask * A unary functor returns scalar value at `id`. diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 081ec0b84cb..7ccc5879f5f 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -583,7 +583,6 @@ std::unique_ptr make_lists_column( * @param[in] null_mask The bits specifying the null struct values in the column. * @param[in] stream Optional stream for use with all memory allocation and device kernels. * @param[in] mr Optional resource to use for device memory allocation. - * */ std::unique_ptr make_structs_column( size_type num_rows, diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index e491cc30c90..d3d64eb21df 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,8 +42,7 @@ namespace detail { * To enable zero-copy slicing, a `column_view_base` has an `offset` that *indicates the index of the first element in the column relative to the base *device memory allocation. By default, `offset()` is zero. - * - **/ + */ class column_view_base { public: /** @@ -58,7 +57,7 @@ class column_view_base { * * @tparam The type to cast to * @return T const* Typed pointer to underlying data - **/ + */ template T const* head() const noexcept { @@ -75,7 +74,7 @@ class column_view_base { * * @tparam T The type to cast to * @return T const* Typed pointer to underlying data, including the offset - **/ + */ template T const* data() const noexcept { @@ -88,7 +87,7 @@ class column_view_base { * * @tparam T The desired type * @return T const* Pointer to the first element after casting - **/ + */ template T const* begin() const noexcept { @@ -101,7 +100,7 @@ class column_view_base { * * @tparam T The desired type * @return T const* Pointer to one past the last element after casting - **/ + */ template T const* end() const noexcept { @@ -110,17 +109,17 @@ class column_view_base { /** * @brief Returns the number of elements in the column - **/ + */ size_type size() const noexcept { return _size; } /** * @brief Returns true if `size()` returns zero, or false otherwise - **/ + */ size_type is_empty() const noexcept { return size() == 0; } /** * @brief Returns the element `data_type` - **/ + */ data_type type() const noexcept { return _type; } /** @@ -131,7 +130,7 @@ class column_view_base { * * @return true The bitmask is allocated * @return false The bitmask is not allocated - **/ + */ bool nullable() const noexcept { return nullptr != _null_mask; } /** @@ -141,7 +140,7 @@ class column_view_base { * point `set_null_count(UNKNOWN_NULL_COUNT)` was invoked, then the * 
first invocation of `null_count()` will compute and store the count of null * elements indicated by the `null_mask` (if it exists). - **/ + */ size_type null_count() const; /** @@ -156,7 +155,7 @@ class column_view_base { * * @param[in] begin The starting index of the range (inclusive). * @param[in] end The index of the last element in the range (exclusive). - **/ + */ size_type null_count(size_type begin, size_type end) const; /** @@ -165,7 +164,7 @@ class column_view_base { * * @return true One or more elements are null * @return false All elements are valid - **/ + */ bool has_nulls() const { return null_count() > 0; } /** @@ -188,13 +187,13 @@ class column_view_base { * @note This function does *not* account for the `offset()`. * * @note If `null_count() == 0`, this may return `nullptr`. - **/ + */ bitmask_type const* null_mask() const noexcept { return _null_mask; } /** * @brief Returns the index of the first element relative to the base memory * allocation, i.e., what is returned from `head()`. - **/ + */ size_type offset() const noexcept { return _offset; } protected: @@ -278,8 +277,7 @@ class mutable_column_view_base : public column_view_base { * To enable zero-copy slicing, a `column_view` has an `offset` that indicates * the index of the first element in the column relative to the base device * memory allocation. By default, `offset()` is zero. - * - **/ + */ class column_view : public detail::column_view_base { public: column_view() = default; @@ -347,7 +345,7 @@ class column_view : public detail::column_view_base { /** * @brief Returns the number of child columns. - **/ + */ size_type num_children() const noexcept { return _children.size(); } /** @@ -386,8 +384,7 @@ class column_view : public detail::column_view_base { * To enable zero-copy slicing, a `mutable_column_view` has an `offset` that * indicates the index of the first element in the column relative to the base * device memory allocation. By default, `offset()` is zero. 
- * - **/ + */ class mutable_column_view : public detail::column_view_base { public: mutable_column_view() = default; @@ -448,7 +445,7 @@ class mutable_column_view : public detail::column_view_base { * * @tparam The type to cast to * @return T* Typed pointer to underlying data - **/ + */ template T* head() const noexcept { @@ -465,7 +462,7 @@ class mutable_column_view : public detail::column_view_base { * * @tparam T The type to cast to * @return T* Typed pointer to underlying data, including the offset - **/ + */ template T* data() const noexcept { @@ -478,7 +475,7 @@ class mutable_column_view : public detail::column_view_base { * * @tparam T The desired type * @return T* Pointer to the first element after casting - **/ + */ template T* begin() const noexcept { @@ -491,7 +488,7 @@ class mutable_column_view : public detail::column_view_base { * * @tparam T The desired type * @return T* Pointer to one past the last element after casting - **/ + */ template T* end() const noexcept { @@ -516,7 +513,7 @@ class mutable_column_view : public detail::column_view_base { * @throws cudf::logic_error if `new_null_count > 0` and `nullable() == false` * * @param new_null_count The new null count - **/ + */ void set_null_count(size_type new_null_count); /** @@ -532,7 +529,7 @@ class mutable_column_view : public detail::column_view_base { /** * @brief Returns the number of child columns. 
- **/ + */ size_type num_children() const noexcept { return mutable_children.size(); } /** @@ -549,7 +546,7 @@ class mutable_column_view : public detail::column_view_base { * @brief Converts a mutable view into an immutable view * * @return column_view An immutable view of the mutable view's elements - **/ + */ operator column_view() const; private: @@ -563,7 +560,7 @@ class mutable_column_view : public detail::column_view_base { * * @param parent The parent whose descendants will be counted * @return size_type The number of descendants of the parent - **/ + */ size_type count_descendants(column_view parent); /** diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 5e3b3673053..c63fa62679f 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -735,7 +735,7 @@ std::unique_ptr get_element( /** * @brief Indicates whether a row can be sampled more than once. - **/ + */ enum class sample_with_replacement : bool { FALSE, // A row can be sampled only once TRUE // A row can be sampled more than once diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 5959ee34e04..1f70e68fce8 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -337,7 +337,7 @@ constexpr size_type ARGMIN_SENTINEL{-1}; * * @tparam Source The type on which the aggregation is computed * @tparam k The aggregation performed - **/ + */ template struct target_type_impl { using type = void; diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 0af8dd6a500..19d9d81a948 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -40,7 +40,7 @@ namespace detail { * @param[in] end Index of the last desired element in the slice (exclusive). * * @return ColumnView View of the elements `[begin,end)` from `input`. 
- **/ + */ template ColumnView slice(ColumnView const& input, cudf::size_type begin, cudf::size_type end) { @@ -92,7 +92,7 @@ std::unique_ptr shift( * @copydoc cudf::contiguous_split * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::vector contiguous_split( cudf::table_view const& input, std::vector const& splits, diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index a1be386006b..adae9b76c5b 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -331,7 +331,6 @@ struct column_gatherer_impl { /** * @brief Function object for gathering a type-erased * column. To be used with the cudf::type_dispatcher. - * */ struct column_gatherer { /** diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index c6d2014f80e..cdc221dcdbe 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -100,7 +100,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); * @copydoc cudf::to_arrow * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -110,7 +110,7 @@ std::shared_ptr to_arrow(table_view input, * @copydoc cudf::arrow_to_cudf * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::unique_ptr from_arrow( arrow::Table const& input_table, rmm::cuda_stream_view stream = rmm::cuda_stream_default, diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 3d504b142da..75a710d1d5c 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -/** --------------------------------------------------------------------------* +/** * @brief provides column input iterator with nulls replaced with a specified value * @file iterator.cuh * @@ -29,7 +29,7 @@ * For non-null column, use * auto iter = column.begin(); * - * -------------------------------------------------------------------------**/ + */ #pragma once @@ -39,7 +39,7 @@ namespace cudf { namespace detail { -/** -------------------------------------------------------------------------* +/** * @brief value accessor of column with null bitmask * A unary functor returns scalar value at `id`. * `operator() (cudf::size_type id)` computes `element` and valid flag at `id` @@ -52,17 +52,17 @@ namespace detail { * @throws cudf::logic_error if column datatype and Element type mismatch. * * @tparam Element The type of elements in the column - * -------------------------------------------------------------------------**/ + */ template struct null_replaced_value_accessor { column_device_view const col; ///< column view of column in device Element const null_replacement{}; ///< value returned when element is null - /** -------------------------------------------------------------------------* + /** * @brief constructor * @param[in] _col column device view of cudf column * @param[in] null_replacement The value to return for null elements - * -------------------------------------------------------------------------**/ + */ null_replaced_value_accessor(column_device_view const& _col, Element null_val) : col{_col}, null_replacement{null_val} { @@ -78,21 +78,21 @@ struct null_replaced_value_accessor { } }; -/** -------------------------------------------------------------------------* +/** * @brief validity accessor of column with null bitmask * A unary functor returns validity at `id`. 
* `operator() (cudf::size_type id)` computes validity flag at `id` * This functor is only allowed for nullable columns. * * @throws cudf::logic_error if the column is not nullable. - * -------------------------------------------------------------------------**/ + */ struct validity_accessor { column_device_view const col; - /** -------------------------------------------------------------------------* + /** * @brief constructor * @param[in] _col column device view of cudf column - * -------------------------------------------------------------------------**/ + */ validity_accessor(column_device_view const& _col) : col{_col} { // verify valid is non-null, otherwise, is_valid() will crash diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index cbf56d19380..06f9bfc5034 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -67,7 +67,6 @@ using index_vector = rmm::device_vector; * As a result, a special comparison logic is necessary whereby the index is "tagged" with side * information and consequently comparator functors (predicates) must operate on these tagged * indices rather than on raw indices. - * */ template struct tagged_element_relational_comparator { diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 9a5e000f265..2f2bc91cb74 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -28,7 +28,7 @@ namespace detail { * @copydoc cudf::create_null_mask(size_type, mask_state, rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ rmm::device_buffer create_null_mask( size_type size, mask_state state, @@ -39,7 +39,7 @@ rmm::device_buffer create_null_mask( * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool) * * @param stream CUDA stream used for device memory operations and kernel launches. 
- **/ + */ void set_null_mask(bitmask_type *bitmask, size_type begin_bit, size_type end_bit, @@ -69,7 +69,7 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, *rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ rmm::device_buffer copy_bitmask( bitmask_type const *mask, size_type begin_bit, @@ -81,7 +81,7 @@ rmm::device_buffer copy_bitmask( * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ rmm::device_buffer copy_bitmask( column_view const &view, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/nvtx/nvtx3.hpp b/cpp/include/cudf/detail/nvtx/nvtx3.hpp index 4ef665a8f2e..add5699e34a 100644 --- a/cpp/include/cudf/detail/nvtx/nvtx3.hpp +++ b/cpp/include/cudf/detail/nvtx/nvtx3.hpp @@ -27,7 +27,6 @@ * * If this value is incremented, the above version include guard needs to be * updated. - * */ #define NVTX3_MINOR_VERSION 0 @@ -496,7 +495,6 @@ * NVTX3_FUNC_RANGE(); * } * \endcode - * */ /** @@ -506,7 +504,6 @@ * Initializing a legacy-C (i.e., no constructor) union member requires * initializing in the constructor body. Non-empty constexpr constructors * require C++14 relaxed constexpr. - * */ #if __cpp_constexpr >= 201304L #define NVTX3_RELAXED_CONSTEXPR constexpr @@ -741,7 +738,6 @@ class domain { * will be grouped together. * * @return Reference to the `domain` corresponding to the global NVTX domain. - * */ template <> inline domain const& domain::get() @@ -753,7 +749,6 @@ inline domain const& domain::get() /** * @brief Indicates the values of the red, green, blue color channels for * a rgb color code. - * */ struct rgb { /// Type used for component values @@ -782,7 +777,6 @@ struct rgb { /** * @brief Indicates the value of the alpha, red, green, and blue color * channels for an argb color code. 
- * */ struct argb final : rgb { /** @@ -815,7 +809,6 @@ struct argb final : rgb { * Specifying colors for NVTX events is a convenient way to visually * differentiate among different events in a visualization tool such as Nsight * Systems. - * */ class color { public: @@ -921,7 +914,6 @@ class color { * \endcode * * To associate a name string with a category id, see `named_category`. - * */ class category { public: @@ -1537,7 +1529,6 @@ class payload { * // they will be forwarded to the `EventAttribute`s constructor * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; * \endcode - * */ class event_attributes { public: @@ -1763,7 +1754,6 @@ class domain_thread_range { /** * @brief Alias for a `domain_thread_range` in the global NVTX domain. - * */ using thread_range = domain_thread_range<>; @@ -1854,7 +1844,6 @@ class domain_process_range { /** * @brief Alias for a `domain_process_range` in the global NVTX domain. - * */ using process_range = domain_process_range<>; diff --git a/cpp/include/cudf/detail/nvtx/ranges.hpp b/cpp/include/cudf/detail/nvtx/ranges.hpp index 10f5916cde1..de5f9901506 100644 --- a/cpp/include/cudf/detail/nvtx/ranges.hpp +++ b/cpp/include/cudf/detail/nvtx/ranges.hpp @@ -21,7 +21,6 @@ namespace cudf { /** * @brief Tag type for libcudf's NVTX domain. - * */ struct libcudf_domain { static constexpr char const* name{"libcudf"}; ///< Name of the libcudf domain @@ -29,7 +28,6 @@ struct libcudf_domain { /** * @brief Alias for an NVTX range in the libcudf domain. - * */ using thread_range = ::nvtx3::domain_thread_range; @@ -49,6 +47,5 @@ using thread_range = ::nvtx3::domain_thread_range; * ... 
* } * ``` - * */ #define CUDF_FUNC_RANGE() NVTX3_FUNC_RANGE_IN(cudf::libcudf_domain) diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh index 9d20375e8b1..2c2b259f1fe 100644 --- a/cpp/include/cudf/detail/reduction.cuh +++ b/cpp/include/cudf/detail/reduction.cuh @@ -33,7 +33,7 @@ namespace cudf { namespace reduction { namespace detail { -/** --------------------------------------------------------------------------* +/** * @brief Compute the specified simple reduction over the input range of elements. * * @param[in] d_in the begin iterator @@ -45,7 +45,7 @@ namespace detail { * @tparam Op the reduction operator with device binary operator * @tparam InputIterator the input column iterator * @tparam OutputType the output type of reduction - * ----------------------------------------------------------------------------**/ + */ template ::type, @@ -148,7 +148,7 @@ std::unique_ptr reduce(InputIterator d_in, return std::unique_ptr(s); } -/** --------------------------------------------------------------------------* +/** * @brief compute reduction by the compound operator (reduce and transform) * * @param[in] d_in the begin iterator @@ -166,7 +166,7 @@ std::unique_ptr reduce(InputIterator d_in, * @tparam Op the reduction operator with device binary operator * @tparam InputIterator the input column iterator * @tparam OutputType the output type of reduction - * ----------------------------------------------------------------------------**/ + */ template { * @param[in] mr Device memory resource used to allocate the returned table's device memory * * @return Result of scattering values from source to target - **/ + */ template std::unique_ptr
scatter( table_view const& source, diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index a5676c86f49..a3b1f95ca0a 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -61,7 +61,7 @@ namespace detail { * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target - **/ + */ std::unique_ptr
scatter( table_view const& source, column_view const& scatter_map, @@ -101,7 +101,7 @@ std::unique_ptr
scatter( * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target - **/ + */ std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index c3bbb734476..6ba46219166 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -30,7 +30,7 @@ namespace detail { *rmm::mr::get_current_device_resource()) * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::unique_ptr sequence( size_type size, scalar const& init, @@ -44,7 +44,7 @@ std::unique_ptr sequence( rmm::mr::get_current_device_resource()) * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::unique_ptr sequence( size_type size, scalar const& init, diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 0309542d01f..bea480d85cd 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -26,7 +26,7 @@ namespace detail { * @copydoc cudf::transform * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::unique_ptr transform( column_view const& input, std::string const& unary_udf, @@ -39,7 +39,7 @@ std::unique_ptr transform( * @copydoc cudf::nans_to_nulls * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::pair, size_type> nans_to_nulls( column_view const& input, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -49,7 +49,7 @@ std::pair, size_type> nans_to_nulls( * @copydoc cudf::bools_to_mask * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::pair, cudf::size_type> bools_to_mask( column_view const& input, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -59,7 +59,7 @@ std::pair, cudf::size_type> bools_to_mask( * @copydoc cudf::encode * * @param stream CUDA stream used for device memory operations and kernel launches. 
- **/ + */ std::pair, std::unique_ptr> encode( cudf::table_view const& input, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -69,7 +69,7 @@ std::pair, std::unique_ptr> encode( * @copydoc cudf::mask_to_bools * * @param stream CUDA stream used for device memory operations and kernel launches. - **/ + */ std::unique_ptr mask_to_bools( bitmask_type const* null_mask, size_type begin_bit, diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index a2f73d9a0e9..246817a5cb5 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -16,7 +16,7 @@ #pragma once -/** ---------------------------------------------------------------------------* +/** * @brief overloads for CUDA atomic operations * @file device_atomics.cuh * @@ -30,7 +30,7 @@ * `atomicAnd`, `atomicOr`, `atomicXor` are also supported for integer data types. * Also provides `cudf::genericAtomicOperation` which performs atomic operation * with the given binary operator. 
- * ---------------------------------------------------------------------------**/ + */ #include #include @@ -412,7 +412,7 @@ struct typesAtomicCASImpl { } // namespace detail -/** -------------------------------------------------------------------------* +/** * @brief compute atomic binary operation * reads the `old` located at the `address` in global or shared memory, * computes 'BinaryOp'('old', 'update_value'), @@ -427,7 +427,7 @@ struct typesAtomicCASImpl { * @param[in] op The binary operator used for compute * * @returns The old value at `address` - * -------------------------------------------------------------------------**/ + */ template typename std::enable_if_t(), T> __forceinline__ __device__ genericAtomicOperation(T* address, T const& update_value, BinaryOp op) @@ -476,7 +476,7 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, } // namespace cudf -/** -------------------------------------------------------------------------* +/** * @brief Overloads for `atomicAdd` * reads the `old` located at the `address` in global or shared memory, * computes (old + val), and stores the result back to memory at the same @@ -496,14 +496,14 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @param[in] val The value to be added * * @returns The old value at `address` - * -------------------------------------------------------------------------**/ + */ template __forceinline__ __device__ T atomicAdd(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{}); } -/** -------------------------------------------------------------------------* +/** * @brief Overloads for `atomicMin` * reads the `old` located at the `address` in global or shared memory, * computes the minimum of old and val, and stores the result back to memory @@ -522,14 +522,14 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) * @param[in] val The value to be computed * * @returns The old value at `address` - * 
-------------------------------------------------------------------------**/ + */ template __forceinline__ __device__ T atomicMin(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceMin{}); } -/** -------------------------------------------------------------------------* +/** * @brief Overloads for `atomicMax` * reads the `old` located at the `address` in global or shared memory, * computes the maximum of old and val, and stores the result back to memory @@ -548,14 +548,14 @@ __forceinline__ __device__ T atomicMin(T* address, T val) * @param[in] val The value to be computed * * @returns The old value at `address` - * -------------------------------------------------------------------------**/ + */ template __forceinline__ __device__ T atomicMax(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceMax{}); } -/** --------------------------------------------------------------------------* +/** * @brief Overloads for `atomicCAS` * reads the `old` located at the `address` in global or shared memory, * computes (`old` == `compare` ? 
`val` : `old`), @@ -575,14 +575,14 @@ __forceinline__ __device__ T atomicMax(T* address, T val) * @param[in] val The value to be computed * * @returns The old value at `address` - * -------------------------------------------------------------------------**/ + */ template __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } -/** -------------------------------------------------------------------------* +/** * @brief Overloads for `atomicAnd` * reads the `old` located at the `address` in global or shared memory, * computes (old & val), and stores the result back to memory at the same @@ -596,14 +596,14 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) * @param[in] val The value to be computed * * @returns The old value at `address` - * -------------------------------------------------------------------------**/ + */ template ::value, T>* = nullptr> __forceinline__ __device__ T atomicAnd(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceAnd{}); } -/** -------------------------------------------------------------------------* +/** * @brief Overloads for `atomicOr` * reads the `old` located at the `address` in global or shared memory, * computes (old | val), and stores the result back to memory at the same @@ -617,14 +617,14 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) * @param[in] val The value to be computed * * @returns The old value at `address` - * -------------------------------------------------------------------------**/ + */ template ::value, T>* = nullptr> __forceinline__ __device__ T atomicOr(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceOr{}); } -/** -------------------------------------------------------------------------* +/** * @brief Overloads for `atomicXor` * reads the `old` located at the `address` in global or shared memory, * computes (old ^ val), and stores 
the result back to memory at the same @@ -638,7 +638,7 @@ __forceinline__ __device__ T atomicOr(T* address, T val) * @param[in] val The value to be computed * * @returns The old value at `address` - * -------------------------------------------------------------------------**/ + */ template ::value, T>* = nullptr> __forceinline__ __device__ T atomicXor(T* address, T val) { diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 7245bee1aa3..659f0d00d6f 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -17,11 +17,10 @@ #ifndef DEVICE_OPERATORS_CUH #define DEVICE_OPERATORS_CUH -/** ---------------------------------------------------------------------------* +/** * @brief definition of the device operators * @file device_operators.cuh - * - * ---------------------------------------------------------------------------**/ + */ #include #include @@ -94,7 +93,6 @@ struct DeviceCount { * character. This serves as identity value for maximum operator on string * values. Also, this char pointer serves as valid device pointer of identity * value for minimum operator on string values. - * */ __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 5f7728b8c1e..e9d66d125dd 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -388,7 +388,6 @@ struct MurmurHash3_32 { return h; } - /* --------------------------------------------------------------------------*/ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. 
@@ -400,7 +399,6 @@ struct MurmurHash3_32 { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - /* ----------------------------------------------------------------------------*/ CUDA_DEVICE_CALLABLE result_type hash_combine(result_type lhs, result_type rhs) { result_type combined{lhs}; @@ -694,17 +692,14 @@ SparkMurmurHash3_32::operator()(double const& key) const return this->compute_floating_point(key); } -/* --------------------------------------------------------------------------*/ /** * @brief This hash function simply returns the value that is asked to be hash - reinterpreted as the result_type of the functor. + * reinterpreted as the result_type of the functor. */ -/* ----------------------------------------------------------------------------*/ template struct IdentityHash { using result_type = hash_value_type; - /* --------------------------------------------------------------------------*/ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. 
@@ -716,7 +711,6 @@ struct IdentityHash { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - /* ----------------------------------------------------------------------------*/ CUDA_HOST_DEVICE_CALLABLE result_type hash_combine(result_type lhs, result_type rhs) const { result_type combined{lhs}; diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index ca40d7516e8..dc919433da7 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -20,7 +20,6 @@ /** * @file Utility code involving integer arithmetic - * */ #include diff --git a/cpp/include/cudf/detail/utilities/release_assert.cuh b/cpp/include/cudf/detail/utilities/release_assert.cuh index 2ca32fdcb8b..e0db88d8fcb 100644 --- a/cpp/include/cudf/detail/utilities/release_assert.cuh +++ b/cpp/include/cudf/detail/utilities/release_assert.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,8 +26,7 @@ * regardless of the state of `NDEBUG`. * * Relies on the `__PRETTY_FUNCTION__` macro which is specific to GCC and Clang. - * - **/ + */ #if defined(__CUDA_ARCH__) && (defined(__clang__) || defined(__GNUC__)) #define __ASSERT_STR_HELPER(x) #x #define release_assert(e) \ diff --git a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh index a3da7a36b90..8c0abbad49f 100644 --- a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh +++ b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -/** --------------------------------------------------------------------------* +/** * @brief unary functions for thrust::transform_iterator * @file transform_unary_functions.cuh * @@ -22,7 +22,7 @@ * for thrust::transform_iterator. * For the detail of example cases, * @see iterator.cuh iterator_test.cu - * -------------------------------------------------------------------------**/ + */ #pragma once @@ -59,7 +59,7 @@ struct null_replacing_transformer { } }; -/** -------------------------------------------------------------------------* +/** * @brief intermediate struct to calculate mean and variance * This is an example case to output a struct from column input. * @@ -69,7 +69,7 @@ struct null_replacing_transformer { * and `variance` (= sum of squares / count - mean^2). * * @tparam ElementType element data type of value and value_squared. - * -------------------------------------------------------------------------**/ + */ template struct meanvar { ElementType value; /// the value @@ -101,7 +101,7 @@ struct meanvar { // -------------------------------------------------------------------------- // transformers -/** -------------------------------------------------------------------------* +/** * @brief Transforms a scalar by first casting to another type, and then squaring the result. * * This struct transforms the output value as @@ -110,14 +110,14 @@ struct meanvar { * This will be used to compute "sum of squares". 
* * @tparam ResultType scalar data type of output - * -------------------------------------------------------------------------**/ + */ template struct transformer_squared { CUDA_HOST_DEVICE_CALLABLE ElementType operator()(ElementType const &value) { return (value * value); }; }; -/** -------------------------------------------------------------------------* +/** * @brief Uses a scalar value to construct a `meanvar` object. * This transforms `thrust::pair` into * `ResultType = meanvar` form. @@ -125,7 +125,7 @@ struct transformer_squared { * This struct transforms the value and the squared value and the count at once. * * @tparam ElementType scalar data type of input - * -------------------------------------------------------------------------**/ + */ template struct transformer_meanvar { using ResultType = meanvar; diff --git a/cpp/include/cudf/detail/utilities/trie.cuh b/cpp/include/cudf/detail/utilities/trie.cuh index ab43366aa3c..5370c8678cf 100644 --- a/cpp/include/cudf/detail/utilities/trie.cuh +++ b/cpp/include/cudf/detail/utilities/trie.cuh @@ -17,7 +17,6 @@ /** * @brief Serialized trie implementation for C++/CUDA * @file trie.cuh - * */ #pragma once @@ -89,8 +88,12 @@ inline thrust::host_vector createSerializedTrie( // Serialize the tree trie std::deque to_visit; thrust::host_vector nodes; - // suport for matching empty input + + // If the Tree trie matches empty strings, the root node is marked as 'end of word'. + // The first node in the serialized trie is also used to match empty strings, so we're + // initializing it using the `is_end_of_word` value from the root node. nodes.push_back(SerialTrieNode(trie_terminating_character, tree_trie.is_end_of_word)); + // Add root node to queue. 
This node is not included in the serialized trie to_visit.emplace_back(&tree_trie, -1); while (!to_visit.empty()) { diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index f8f3ba51468..c685837ae2b 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -113,7 +113,7 @@ std::pair valid_if( return std::make_pair(std::move(null_mask), null_count); } -/**----------------------------------------------------------------------------* +/** * @brief Populates a set of bitmasks by applying a binary predicate to two * input ranges. @@ -146,7 +146,7 @@ std::pair valid_if( * remaining bits may not be initialized. * @param valid_counts Used to obtain the total number of valid bits for each * mask. - **/ + */ template repeat( * @param step Increment value * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr The result table containing the sequence - **/ + */ std::unique_ptr sequence( size_type size, scalar const& init, @@ -197,7 +197,7 @@ std::unique_ptr sequence( * @param init First value in the sequence * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr The result table containing the sequence - **/ + */ std::unique_ptr sequence( size_type size, scalar const& init, diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index fc809b03dfa..f7f7f51479d 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -62,7 +62,6 @@ struct aggregation_request { * For every `aggregation_request` given to `groupby::aggregate` an * `aggregation_result` will be returned. The `aggregation_result` holds the * resulting column(s) for each requested aggregation on the `request`s values.
- * */ struct aggregation_result { /// Columns of results from an `aggregation_request` diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 042a85cfd6e..9dbde1432aa 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -109,7 +109,7 @@ struct column_metadata { * @param metadata Contains hierarchy of names of columns and children * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` - **/ + */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); @@ -120,7 +120,7 @@ std::shared_ptr to_arrow(table_view input, * @param input arrow:Table that needs to be converted to `cudf::table` * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table. - **/ + */ std::unique_ptr
from_arrow( arrow::Table const& input, diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 6c830e31a56..0ae403458a0 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -30,21 +30,21 @@ namespace cudf { namespace io { /** * @brief Interface class for storing the output data from the writers - **/ + */ class data_sink { public: /** * @brief Create a sink from a file path * * @param[in] filepath Path to the file to use - **/ + */ static std::unique_ptr create(const std::string& filepath); /** * @brief Create a sink from a std::vector * * @param[in,out] buffer Pointer to the output vector - **/ + */ static std::unique_ptr create(std::vector* buffer); /** @@ -53,7 +53,7 @@ class data_sink { * A useful code path for benchmarking, to eliminate physical * hardware randomness from profiling. * - **/ + */ static std::unique_ptr create(); /** @@ -65,12 +65,12 @@ class data_sink { * class that wraps the user pointer. The principle is to allow the user to declare * a custom sink instance and use it across multiple write() calls. * - **/ + */ static std::unique_ptr create(cudf::io::data_sink* const user_sink); /** * @brief Base class destructor - **/ + */ virtual ~data_sink(){}; /** @@ -80,7 +80,7 @@ class data_sink { * @param[in] size Number of bytes to write * * @return void - **/ + */ virtual void host_write(void const* data, size_t size) = 0; /** @@ -104,7 +104,7 @@ class data_sink { * write() calls as well. * * @return bool If this writer supports device_write() calls. 
- **/ + */ virtual bool supports_device_write() const { return false; } /** @@ -114,7 +114,7 @@ class data_sink { * @param[in] size Number of bytes to write * * @return void - **/ + */ virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { CUDF_FAIL("data_sink classes that support device_write must override this function."); @@ -131,7 +131,7 @@ class data_sink { * @brief Returns the total number of bytes written into this sink * * @return size_t Total number of bytes written into this sink - **/ + */ virtual size_t bytes_written() = 0; }; diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 262d79b64c2..7d56c1c0fc6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -61,7 +61,6 @@ class json_reader_options_builder; * | `date_unit` | only millisecond units are supported | * | `encoding` | only ASCII-encoded data is supported | * | `chunksize` | use `byte_range_xxx` for chunking instead | - * */ class json_reader_options { source_info _source; diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 8d1e192cee0..661b36f10c8 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -148,7 +148,6 @@ struct table_metadata { * * In the case where column nullability is known, pass `true` if the corresponding column could * contain nulls in one or more subtables to be written, otherwise `false`. - * */ struct table_metadata_with_nullability : public table_metadata { std::vector column_nullable; //!< Per-column nullability information. diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 37847c41339..b2c1296ccef 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -394,10 +394,12 @@ class hash_join { * * @param build The build table, from which the hash table is built. * @param build_on The column indices from `build` to join on. 
+ * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches */ hash_join(cudf::table_view const& build, std::vector const& build_on, + null_equality compare_nulls, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index cde07b9427d..0dcc4e2b37c 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -232,7 +232,6 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, * * @returns The gather_data struct needed to construct the gather map for the * next level of recursion. - * */ template gather_data make_gather_data(cudf::lists_column_view const& source_column, @@ -261,7 +260,6 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, * @param mr Memory resource to use for all allocations * * @returns column with elements gathered based on `gather_data` - * */ std::unique_ptr gather_list_nested( lists_column_view const& list, @@ -280,7 +278,6 @@ std::unique_ptr gather_list_nested( * @param mr Memory resource to use for all allocations * * @returns column with elements gathered based on `gather_data` - * */ std::unique_ptr gather_list_leaf( column_view const& column, diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index be6bf88da30..38708d4878e 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -24,7 +24,6 @@ namespace cudf { /** * @brief A non-owning, immutable view of device data that represents * a list of elements of arbitrary type (including further nested lists). 
- * */ class list_device_view { using lists_column_device_view = cudf::detail::lists_column_device_view; diff --git a/cpp/include/cudf/lists/list_view.cuh b/cpp/include/cudf/lists/list_view.cuh index 898b274781a..9af722e444b 100644 --- a/cpp/include/cudf/lists/list_view.cuh +++ b/cpp/include/cudf/lists/list_view.cuh @@ -25,7 +25,6 @@ namespace cudf { /** * @brief A non-owning, immutable view of device data that represents * a list of elements of arbitrary type (including further nested lists). - * */ class list_view { }; diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 690f4cdbbb0..5e1f0f0802e 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -37,7 +37,7 @@ namespace cudf { * @param state The state of the null mask * @param size The number of elements represented by the mask * @return size_type The count of null elements - **/ + */ size_type state_null_count(mask_state state, size_type size); /** @@ -51,7 +51,7 @@ size_type state_null_count(mask_state state, size_type size); * @param padding_boundary The value returned will be rounded up to a multiple * of this value * @return std::size_t The necessary number of bytes - **/ + */ std::size_t bitmask_allocation_size_bytes(size_type number_of_bits, std::size_t padding_boundary = 64); @@ -79,7 +79,7 @@ size_type num_bitmask_words(size_type number_of_bits); * @param mr Device memory resource used to allocate the returned device_buffer. * @return rmm::device_buffer A `device_buffer` for use as a null bitmask * satisfying the desired size and state - **/ + */ rmm::device_buffer create_null_mask( size_type size, mask_state state, @@ -96,7 +96,7 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null. 
- **/ + */ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); /** @@ -112,7 +112,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param start_bit Index of the first bit to count (inclusive) * @param stop_bit Index of the last bit to count (exclusive) * @return The number of non-zero bits in the specified range - **/ + */ cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop); /** @@ -128,7 +128,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, siz * @param start_bit Index of the first bit to count (inclusive) * @param stop_bit Index of the last bit to count (exclusive) * @return The number of zero bits in the specified range - **/ + */ cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop); /** @@ -184,7 +184,7 @@ std::vector segmented_count_unset_bits(bitmask_type const* bitmask, * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. - **/ + */ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, @@ -201,7 +201,7 @@ rmm::device_buffer copy_bitmask( * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. - **/ + */ rmm::device_buffer copy_bitmask( column_view const& view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index ecbf999aa92..44a64a01c5e 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -52,7 +52,7 @@ namespace cudf { * @param[in] agg The rolling window aggregation type (SUM, MAX, MIN, etc.) 
* * @returns A nullable output column containing the rolling window results - **/ + */ std::unique_ptr rolling_window( column_view const& input, size_type preceding_window, @@ -188,7 +188,7 @@ struct window_bounds { * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.) * * @returns A nullable output column containing the rolling window results - **/ + */ std::unique_ptr grouped_rolling_window( table_view const& group_keys, column_view const& input, diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index c9002e5a9a4..de01f4b860f 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -330,26 +330,26 @@ class fixed_point_scalar : public scalar { bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar{data_type{type_to_id(), 0}, is_valid, stream, mr}, + : scalar{data_type{type_to_id(), _type.scale()}, is_valid, stream, mr}, _data{numeric::scaled_integer{value}.value} { - CUDF_EXPECTS(value == (T{_data.value(), numeric::scale_type{0}}), - "scale of fixed_point value should be zero"); } /** * @brief Construct a new fixed_point scalar object from existing device memory. * - * @param[in] data The scalar's data in device memory - * @param[in] is_valid Whether the value held by the scalar is valid - * @param[in] stream CUDA stream used for device memory operations. - * @param[in] mr Device memory resource to use for device memory allocation + * @param[in] data The scalar's data in device memory + * @param[in] scale The scale of the fixed_point scalar + * @param[in] is_valid Whether the value held by the scalar is valid + * @param[in] stream CUDA stream used for device memory operations. 
+ * @param[in] mr Device memory resource to use for device memory allocation */ fixed_point_scalar(rmm::device_scalar&& data, + numeric::scale_type scale, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar{data_type{type_to_id()}, is_valid, stream, mr}, + : scalar{data_type{type_to_id(), scale}, is_valid, stream, mr}, _data{std::forward>(data)} { } diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index a4f404b5d19..aa3cd932f4f 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -214,9 +214,14 @@ class fixed_point_scalar_device_view : public detail::scalar_device_view_base { using rep_type = typename T::rep; fixed_point_scalar_device_view(data_type type, rep_type* data, bool* is_valid) - : detail::scalar_device_view_base(type, is_valid) + : detail::scalar_device_view_base(type, is_valid), _data(data) { } + + __device__ void set_value(rep_type value) { *_data = value; } + + private: + rep_type* _data{}; }; /** @@ -310,4 +315,13 @@ auto get_scalar_device_view(duration_scalar& s) return duration_scalar_device_view(s.type(), s.data(), s.validity_data()); } +/** + * @brief Get the device view of a fixed_point_scalar + */ +template +auto get_scalar_device_view(fixed_point_scalar& s) +{ + return fixed_point_scalar_device_view(s.type(), s.data(), s.validity_data()); +} + } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 8fbecc2e815..b9c35912a8f 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -17,9 +17,7 @@ #include #include -#include -#include -#include +#include #include #include @@ -75,22 +73,38 @@ std::unique_ptr gather( auto d_strings = *strings_column; // build offsets column - auto 
offsets_transformer = [d_strings, strings_count] __device__(size_type idx) { - if (NullifyOutOfBounds && ((idx < 0) || (idx >= strings_count))) return 0; - if (d_strings.is_null(idx)) return 0; - return d_strings.element(idx).size_bytes(); - }; - auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + output_count, stream, mr); - auto offsets_view = offsets_column->view(); - auto d_offsets = offsets_view.template data(); + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets_column->mutable_view().template data(); + thrust::transform(rmm::exec_policy(stream), + begin, + end, + d_offsets, + [d_strings, strings_count] __device__(size_type idx) { + if (NullifyOutOfBounds && ((idx < 0) || (idx >= strings_count))) return 0; + if (d_strings.is_null(idx)) return 0; + return d_strings.element(idx).size_bytes(); + }); + + // check total size is not too large + size_t total_bytes = thrust::transform_reduce( + rmm::exec_policy(stream), + d_offsets, + d_offsets + output_count, + [] __device__(auto size) { return static_cast(size); }, + size_t{0}, + thrust::plus{}); + CUDF_EXPECTS(total_bytes < std::numeric_limits::max(), + "total size of output strings is too large for a cudf column"); + + // create offsets from sizes + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + output_count + 1, d_offsets); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[output_count]; + size_type bytes = static_cast(total_bytes); auto chars_column = create_chars_child_column(output_count, 0, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); + auto d_chars = chars_column->mutable_view().template data(); // fill in chars auto gather_chars = 
[d_strings, begin, strings_count, d_offsets, d_chars] __device__(size_type idx) { diff --git a/cpp/include/cudf/strings/sorting.hpp b/cpp/include/cudf/strings/sorting.hpp index 84ce2e4ec2b..399625e3265 100644 --- a/cpp/include/cudf/strings/sorting.hpp +++ b/cpp/include/cudf/strings/sorting.hpp @@ -26,7 +26,7 @@ namespace detail { /** * @brief Sort types for the sort method. - **/ + */ enum sort_type { none = 0, ///< no sorting length = 1, ///< sort by string length diff --git a/cpp/include/cudf/structs/struct_view.hpp b/cpp/include/cudf/structs/struct_view.hpp index 778ffccf7b4..18f0384118b 100644 --- a/cpp/include/cudf/structs/struct_view.hpp +++ b/cpp/include/cudf/structs/struct_view.hpp @@ -26,7 +26,6 @@ namespace cudf { * @brief A non-owning, immutable view of device data that represents * a struct with fields of arbitrary types (including primitives, lists, * and other structs) - * */ class struct_view { }; diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 75c2340e51b..d9840e78be2 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,7 @@ namespace cudf { * * Equivalence is defined as `not (a __device__ weak_ordering compare_elements(Element lhs, Element rhs) { @@ -140,7 +139,7 @@ __device__ bool equality_compare(Element const lhs, Element const rhs) * @brief Performs an equality comparison between two elements in two columns. * * @tparam has_nulls Indicates the potential for null values in either column. 
- **/ + */ template class element_equality_comparator { public: @@ -153,7 +152,7 @@ class element_equality_comparator { * @param lhs The column containing the first element * @param rhs The column containing the second element (may be the same as lhs) * @param nulls_are_equal Indicates if two null elements are treated as equivalent - **/ + */ __host__ __device__ element_equality_comparator(column_device_view lhs, column_device_view rhs, bool nulls_are_equal = true) @@ -232,7 +231,7 @@ class row_equality_comparator { * @brief Performs a relational comparison between two elements in two columns. * * @tparam has_nulls Indicates the potential for null values in either column. - **/ + */ template class element_relational_comparator { public: @@ -246,7 +245,7 @@ class element_relational_comparator { * @param rhs The column containing the second element (may be the same as lhs) * @param null_precedence Indicates how null values are ordered with other * values - **/ + */ __host__ __device__ element_relational_comparator(column_device_view lhs, column_device_view rhs, null_order null_precedence) @@ -316,7 +315,7 @@ class element_relational_comparator { * `aac < abb`. * * @tparam has_nulls Indicates the potential for null values in either row. - **/ + */ template class row_lexicographic_comparator { public: @@ -388,7 +387,7 @@ class row_lexicographic_comparator { * * @tparam hash_function Hash functor to use for hashing elements. * @tparam has_nulls Indicates the potential for null values in the column. - **/ + */ template